diff --git a/MANIFEST.in b/MANIFEST.in index 33a48428cf..f277566d40 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,5 +7,4 @@ include MANIFEST.in include README.md include requirements.txt -recursive-include mediapipe/modules *.tflite *.txt -recursive-include mediapipe/graphs *.binarypb +recursive-include mediapipe/modules *.tflite *.txt *.binarypb diff --git a/README.md b/README.md index 5191e6d635..263673bb6c 100644 --- a/README.md +++ b/README.md @@ -35,9 +35,9 @@ Object Detection []() | Android | iOS | Desktop | Python | Web | Coral :---------------------------------------------------------------------------------------- | :-----: | :-: | :-----: | :----: | :-: | :---: [Face Detection](https://google.github.io/mediapipe/solutions/face_detection) | ✅ | ✅ | ✅ | | ✅ | ✅ -[Face Mesh](https://google.github.io/mediapipe/solutions/face_mesh) | ✅ | ✅ | ✅ | | | +[Face Mesh](https://google.github.io/mediapipe/solutions/face_mesh) | ✅ | ✅ | ✅ | ✅ | | [Iris](https://google.github.io/mediapipe/solutions/iris) | ✅ | ✅ | ✅ | | ✅ | -[Hands](https://google.github.io/mediapipe/solutions/hands) | ✅ | ✅ | ✅ | | ✅ | +[Hands](https://google.github.io/mediapipe/solutions/hands) | ✅ | ✅ | ✅ | ✅ | ✅ | [Pose](https://google.github.io/mediapipe/solutions/pose) | ✅ | ✅ | ✅ | ✅ | ✅ | [Hair Segmentation](https://google.github.io/mediapipe/solutions/hair_segmentation) | ✅ | | ✅ | | ✅ | [Object Detection](https://google.github.io/mediapipe/solutions/object_detection) | ✅ | ✅ | ✅ | | | ✅ @@ -53,6 +53,19 @@ See also [MediaPipe Models and Model Cards](https://google.github.io/mediapipe/solutions/models) for ML models released in MediaPipe. +## MediaPipe in Python + +MediaPipe Python package is available on +[PyPI](https://pypi.org/project/mediapipe/), and can be installed simply by `pip +install mediapipe` on Linux and macOS, as described in: + +* [MediaPipe Face Mesh](../solutions/pose.md#python) and + [colab](https://mediapipe.page.link/face_mesh_py_colab) +* [MediaPipe Hands](../solutions/pose.md#python) and + [colab](https://mediapipe.page.link/hands_py_colab) +* [MediaPipe Pose](../solutions/pose.md#python) and + [colab](https://mediapipe.page.link/pose_py_colab) + ## MediaPipe on the Web MediaPipe on the Web is an effort to run the same ML solutions built for mobile diff --git a/WORKSPACE b/WORKSPACE index 395d5f52c4..fe0c1da752 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -364,9 +364,9 @@ http_archive( ) #Tensorflow repo should always go after the other external dependencies. 
-# 2020-08-30 -_TENSORFLOW_GIT_COMMIT = "57b009e31e59bd1a7ae85ef8c0232ed86c9b71db" -_TENSORFLOW_SHA256= "de7f5f06204e057383028c7e53f3b352cdf85b3a40981b1a770c9a415a792c0e" +# 2020-10-30 +_TENSORFLOW_GIT_COMMIT = "84384703c0d8b502e33ff6fd7eefd219dca5ff8e" +_TENSORFLOW_SHA256= "23fb322fc15a20f7a7838d9a31f8b16f60700a494ea654311a0aa8621769df98" http_archive( name = "org_tensorflow", urls = [ diff --git a/build_android_examples.sh b/build_android_examples.sh index 58d6c681ea..37a40ed50e 100644 --- a/build_android_examples.sh +++ b/build_android_examples.sh @@ -93,38 +93,40 @@ for app in ${apps}; do echo "=== Target: ${target}" - if [[ $install_only == false ]]; then - bazel_flags=("${default_bazel_flags[@]}") - bazel_flags+=(${target}) - if [[ $strip == true ]]; then - bazel_flags+=(--linkopt=-s) - fi - - if [[ ${app_name} == "templatematchingcpu" ]]; then - switch_to_opencv_4 - fi - bazel "${bazel_flags[@]}" - cp -f "${bin}" "${apk}" - if [[ ${app_name} == "templatematchingcpu" ]]; then - switch_to_opencv_3 - fi - fi - if [[ ${app_name} == "objectdetection3d" ]]; then - orig_apk=${apk} - apk="${out_dir}/${target_name}_shoes.apk" - cp -f "${orig_apk}" "${apk}" - apks+=(${apk}) - - apk="${out_dir}/${target_name}_chairs.apk" + categories=("shoe" "chair" "cup" "camera" "shoe_1stage" "chair_1stage") + for category in ${categories[@]}; do + apk="${out_dir}/${target_name}_${category}.apk" + if [[ $install_only == false ]]; then + bazel_flags_extended=("${bazel_flags[@]}") + if [[ ${category} != "shoe" ]]; then + bazel_flags_extended+=(--define ${category}=true) + fi + echo "bazel ${bazel_flags_extended[@]}" + bazel "${bazel_flags_extended[@]}" + cp -f "${bin}" "${apk}" + fi + apks+=(${apk}) + done + else if [[ $install_only == false ]]; then - bazel_flags+=(--define chair=true) + bazel_flags=("${default_bazel_flags[@]}") + bazel_flags+=(${target}) + if [[ $strip == true ]]; then + bazel_flags+=(--linkopt=-s) + fi + + if [[ ${app_name} == "templatematchingcpu" ]]; then + switch_to_opencv_4 + fi bazel "${bazel_flags[@]}" cp -f "${bin}" "${apk}" + if [[ ${app_name} == "templatematchingcpu" ]]; then + switch_to_opencv_3 + fi fi + apks+=(${apk}) fi - - apks+=(${apk}) fi done diff --git a/build_desktop_examples.sh b/build_desktop_examples.sh index 7b971f1bc2..36abeb3401 100644 --- a/build_desktop_examples.sh +++ b/build_desktop_examples.sh @@ -86,9 +86,7 @@ for app in ${apps}; do cp -f "${bin_dir}/${app}/"*"_cpu" "${out_dir}" fi if [[ $build_only == false ]]; then - if [[ ${target_name} == "multi_hand_tracking" ]]; then - graph_name="hand_tracking/multi_hand_tracking" - elif [[ ${target_name} == "object_tracking" ]]; then + if [[ ${target_name} == "object_tracking" ]]; then graph_name="tracking/object_detection_tracking" elif [[ ${target_name} == "upper_body_pose_tracking" ]]; then graph_name="pose_tracking/upper_body_pose_tracking" diff --git a/docs/getting_started/android_archive_library.md b/docs/getting_started/android_archive_library.md index 5a333b099a..fc9f90c391 100644 --- a/docs/getting_started/android_archive_library.md +++ b/docs/getting_started/android_archive_library.md @@ -135,6 +135,7 @@ each project. 
def camerax_version = "1.0.0-beta10" implementation "androidx.camera:camera-core:$camerax_version" implementation "androidx.camera:camera-camera2:$camerax_version" + implementation "androidx.camera:camera-lifecycle:$camerax_version" } ``` diff --git a/docs/getting_started/building_examples.md b/docs/getting_started/building_examples.md index 842f1b1551..301a70117f 100644 --- a/docs/getting_started/building_examples.md +++ b/docs/getting_started/building_examples.md @@ -427,45 +427,13 @@ Note: This currently works only on Linux, and please first follow MediaPipe Python package is available on [PyPI](https://pypi.org/project/mediapipe/), and can be installed simply by `pip -install mediapipe` on Linux and macOS, as described below in -[Run in python interpreter](#run-in-python-interpreter) and in this -[colab](https://mediapipe.page.link/mp-py-colab). +install mediapipe` on Linux and macOS, as described in, for instance, +[Python section in MediaPipe Pose](../solutions/pose.md#python) and in this +[colab](https://mediapipe.page.link/pose_py_colab). -### Run in Python interpreter - -Using [MediaPipe Pose](../solutions/pose.md) as an example: - -```bash -# Activate a Python virtual environment. -$ python3 -m venv mp_env && source mp_env/bin/activate - -# Install MediaPipe Python package -(mp_env)$ pip install mediapipe - -# Run in Python interpreter -(mp_env)$ python3 ->>> import mediapipe as mp ->>> pose_tracker = mp.examples.UpperBodyPoseTracker() - -# For image input ->>> pose_landmarks, _ = pose_tracker.run(input_file='/path/to/input/file', output_file='/path/to/output/file') ->>> pose_landmarks, annotated_image = pose_tracker.run(input_file='/path/to/file') - -# For live camera input -# (Press Esc within the output image window to stop the run or let it self terminate after 30 seconds.) ->>> pose_tracker.run_live() - -# Close the tracker. ->>> pose_tracker.close() -``` - -Tip: Use command `deactivate` to exit the Python virtual environment. - -### Building Python package from source - -Follow these steps only if you have local changes and need to build the Python -package from source. Otherwise, we strongly encourage our users to simply run -`pip install mediapipe`, more convenient and much faster. +Follow the steps below only if you have local changes and need to build the +Python package from source. Otherwise, we strongly encourage our users to simply +run `pip install mediapipe`, more convenient and much faster. 1. Make sure that Bazel and OpenCV are correctly installed and configured for MediaPipe. Please see [Installation](./install.md) for how to setup Bazel diff --git a/docs/getting_started/install.md b/docs/getting_started/install.md index 257c817a93..b83f9a9efb 100644 --- a/docs/getting_started/install.md +++ b/docs/getting_started/install.md @@ -12,7 +12,7 @@ nav_order: 1 {:toc} --- -Note: To interoperate with OpenCV, OpenCV 3.x and above are preferred. OpenCV +Note: To interoperate with OpenCV, OpenCV 3.x to 4.1 are preferred. OpenCV 2.x currently works but interoperability support may be deprecated in the future. 
diff --git a/docs/images/mobile/objectron_camera_android_gpu.gif b/docs/images/mobile/objectron_camera_android_gpu.gif new file mode 100644 index 0000000000..2ac32104d3 Binary files /dev/null and b/docs/images/mobile/objectron_camera_android_gpu.gif differ diff --git a/docs/images/mobile/objectron_chair_android_gpu.gif b/docs/images/mobile/objectron_chair_android_gpu.gif index abd1652cae..d2e0ef6712 100644 Binary files a/docs/images/mobile/objectron_chair_android_gpu.gif and b/docs/images/mobile/objectron_chair_android_gpu.gif differ diff --git a/docs/images/mobile/objectron_chair_android_gpu_small.gif b/docs/images/mobile/objectron_chair_android_gpu_small.gif index bef4c5b189..919bc03357 100644 Binary files a/docs/images/mobile/objectron_chair_android_gpu_small.gif and b/docs/images/mobile/objectron_chair_android_gpu_small.gif differ diff --git a/docs/images/mobile/objectron_cup_android_gpu.gif b/docs/images/mobile/objectron_cup_android_gpu.gif new file mode 100644 index 0000000000..6b49e8f17e Binary files /dev/null and b/docs/images/mobile/objectron_cup_android_gpu.gif differ diff --git a/docs/images/mobile/objectron_shoe_android_gpu.gif b/docs/images/mobile/objectron_shoe_android_gpu.gif index 117cdc5de4..ad0ae36972 100644 Binary files a/docs/images/mobile/objectron_shoe_android_gpu.gif and b/docs/images/mobile/objectron_shoe_android_gpu.gif differ diff --git a/docs/images/objectron_2stage_network_architecture.png b/docs/images/objectron_2stage_network_architecture.png new file mode 100644 index 0000000000..591f31f64e Binary files /dev/null and b/docs/images/objectron_2stage_network_architecture.png differ diff --git a/docs/index.md b/docs/index.md index 2e6f1f6dad..fc9bf1d6a5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,9 +35,9 @@ Object Detection []() | Android | iOS | Desktop | Python | Web | Coral :---------------------------------------------------------------------------------------- | :-----: | :-: | :-----: | :----: | :-: | :---: [Face Detection](https://google.github.io/mediapipe/solutions/face_detection) | ✅ | ✅ | ✅ | | ✅ | ✅ -[Face Mesh](https://google.github.io/mediapipe/solutions/face_mesh) | ✅ | ✅ | ✅ | | | +[Face Mesh](https://google.github.io/mediapipe/solutions/face_mesh) | ✅ | ✅ | ✅ | ✅ | | [Iris](https://google.github.io/mediapipe/solutions/iris) | ✅ | ✅ | ✅ | | ✅ | -[Hands](https://google.github.io/mediapipe/solutions/hands) | ✅ | ✅ | ✅ | | ✅ | +[Hands](https://google.github.io/mediapipe/solutions/hands) | ✅ | ✅ | ✅ | ✅ | ✅ | [Pose](https://google.github.io/mediapipe/solutions/pose) | ✅ | ✅ | ✅ | ✅ | ✅ | [Hair Segmentation](https://google.github.io/mediapipe/solutions/hair_segmentation) | ✅ | | ✅ | | ✅ | [Object Detection](https://google.github.io/mediapipe/solutions/object_detection) | ✅ | ✅ | ✅ | | | ✅ @@ -53,6 +53,19 @@ See also [MediaPipe Models and Model Cards](https://google.github.io/mediapipe/solutions/models) for ML models released in MediaPipe. 
+## MediaPipe in Python + +MediaPipe Python package is available on +[PyPI](https://pypi.org/project/mediapipe/), and can be installed simply by `pip +install mediapipe` on Linux and macOS, as described in: + +* [MediaPipe Face Mesh](../solutions/face_mesh.md#python) and + [colab](https://mediapipe.page.link/face_mesh_py_colab) +* [MediaPipe Hands](../solutions/hands.md#python) and + [colab](https://mediapipe.page.link/hands_py_colab) +* [MediaPipe Pose](../solutions/pose.md#python) and + [colab](https://mediapipe.page.link/pose_py_colab) + ## MediaPipe on the Web MediaPipe on the Web is an effort to run the same ML solutions built for mobile diff --git a/docs/solutions/face_mesh.md b/docs/solutions/face_mesh.md index d113318b04..dd9c0bc42b 100644 --- a/docs/solutions/face_mesh.md +++ b/docs/solutions/face_mesh.md @@ -254,6 +254,99 @@ and for iOS modify `kNumFaces` in Tip: Maximum number of faces to detect/process is set to 1 by default. To change it, in the graph file modify the option of `ConstantSidePacketCalculator`. +#### Python + +MediaPipe Python package is available on +[PyPI](https://pypi.org/project/mediapipe/), and can be installed simply by `pip +install mediapipe` on Linux and macOS, as described below and in this +[colab](https://mediapipe.page.link/face_mesh_py_colab). If you do need to build +the Python package from source, see +[additional instructions](../getting_started/building_examples.md#python). + +Activate a Python virtual environment: + +```bash +$ python3 -m venv mp_env && source mp_env/bin/activate +``` + +Install MediaPipe Python package: + +```bash +(mp_env)$ pip install mediapipe +``` + +Run the following Python code: + + + +```python +import cv2 +import mediapipe as mp +mp_drawing = mp.solutions.drawing_utils +mp_face_mesh = mp.solutions.face_mesh + +# For static images: +face_mesh = mp_face_mesh.FaceMesh( + static_image_mode=True, + max_num_faces=1, + min_detection_confidence=0.5) +drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1) +for idx, file in enumerate(file_list): + image = cv2.imread(file) + # Convert the BGR image to RGB before processing. + results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Print and draw face mesh landmarks on the image. + if not results.multi_face_landmarks: + continue + annotated_image = image.copy() + for face_landmarks in results.multi_face_landmarks: + print('face_landmarks:', face_landmarks) + mp_drawing.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACE_CONNECTIONS, + landmark_drawing_spec=drawing_spec, + connection_drawing_spec=drawing_spec) + cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image) +face_mesh.close() + +# For webcam input: +face_mesh = mp_face_mesh.FaceMesh( + min_detection_confidence=0.5, min_tracking_confidence=0.5) +drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1) +cap = cv2.VideoCapture(0) +while cap.isOpened(): + success, image = cap.read() + if not success: + break + + # Flip the image horizontally for a later selfie-view display, and convert + # the BGR image to RGB. + image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB) + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + results = face_mesh.process(image) + + # Draw the face mesh annotations on the image.
+ image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + if results.multi_face_landmarks: + for face_landmarks in results.multi_face_landmarks: + mp_drawing.draw_landmarks( + image=image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACE_CONNECTIONS, + landmark_drawing_spec=drawing_spec, + connection_drawing_spec=drawing_spec) + cv2.imshow('MediaPipe FaceMesh', image) + if cv2.waitKey(5) & 0xFF == 27: + break +face_mesh.close() +cap.release() +``` + ### Face Effect Example Face effect example showcases real-time mobile face effect application use case diff --git a/docs/solutions/hands.md b/docs/solutions/hands.md index 7e49fba69a..f6025fb1b3 100644 --- a/docs/solutions/hands.md +++ b/docs/solutions/hands.md @@ -55,13 +55,21 @@ frame, and only when the landmark model could no longer identify hand presence is palm detection invoked to relocalize the hand. The pipeline is implemented as a MediaPipe -[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt), -which internally utilizes a -[palm/hand detection subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt), -a -[hand landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt) -and a -[renderer subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/subgraphs/renderer_gpu.pbtxt). +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt) +that uses a +[hand landmark tracking subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt) +from the +[hand landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark), +and renders using a dedicated +[hand renderer subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_gpu.pbtxt). +The +[hand landmark tracking subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt) +internally uses a +[hand landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_gpu.pbtxt) +from the same module and a +[palm detection subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/palm_detection/palm_detection_gpu.pbtxt) +from the +[palm detection module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/palm_detection). Note: To visualize a graph, copy the graph and paste it into [MediaPipe Visualizer](https://viz.mediapipe.dev/). 
For more information on how @@ -146,34 +154,11 @@ to visualize its associated subgraphs, please see * iOS target: [`mediapipe/examples/ios/handtrackinggpu:HandTrackingGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/handtrackinggpu/BUILD) -#### With Multi-hand Support - -* Graph: - [`mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt) -* Android target: - [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1Wk6V9EVaz1ks_MInPqqVGvvJD01SGXDc) - [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu:multihandtrackinggpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu/BUILD) -* iOS target: - [`mediapipe/examples/ios/multihandtrackinggpu:MultiHandTrackingGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/multihandtrackinggpu/BUILD) - -There are two key differences between this graph and that in the -[main example](#main-example) (which handles only one hand): - -1. There is a `NormalizedRectVectorHasMinSize` calculator, that checks if in - input vector of `NormalizedRect` objects has a minimum size equal to `N`. In - this graph, if the vector contains fewer than `N` objects, - `MultiHandDetection` subgraph runs. Otherwise, the `GateCalculator` doesn't - send any image packets to the `MultiHandDetection` subgraph. This way, the - main graph is efficient in that it avoids running the costly hand detection - step when there are already `N` hands in the frame. -2. The `MergeCalculator` has been replaced by the `AssociationNormRect` - calculator. This `AssociationNormRect` takes as input a vector of - `NormalizedRect` objects from the `MultiHandDetection` subgraph on the - current frame, and a vector of `NormalizedRect` objects from the - `MultiHandLandmark` subgraph from the previous frame, and performs an - association operation between these objects. This calculator ensures that - the output vector doesn't contain overlapping regions based on the specified - `min_similarity_threshold`. +Tip: Maximum number of hands to detect/process is set to 2 by default. To change +it, for Android modify `NUM_HANDS` in +[MainActivity.java](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/MainActivity.java), +and for iOS modify `kNumHands` in +[HandTrackingViewController.mm](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/handtrackinggpu/HandTrackingViewController.mm). 
#### Palm/Hand Detection Only (no landmarks) @@ -187,8 +172,6 @@ There are two key differences between this graph and that in the ### Desktop -#### Main Example - * Running on CPU * Graph: [`mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt) @@ -196,22 +179,101 @@ There are two key differences between this graph and that in the [`mediapipe/examples/desktop/hand_tracking:hand_tracking_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/hand_tracking/BUILD) * Running on GPU * Graph: - [`mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt) + [`mediapipe/graphs/hand_tracking/hand_tracking_desktop_live_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_desktop_gpu.pbtxt) * Target: [`mediapipe/examples/desktop/hand_tracking:hand_tracking_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/hand_tracking/BUILD) -#### With Multi-hand Support - -* Running on CPU - * Graph: - [`mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop_live.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop_live) - * Target: - [`mediapipe/examples/desktop/multi_hand_tracking:multi_hand_tracking_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/multi_hand_tracking/BUILD) -* Running on GPU - * Graph: - [`mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt) - * Target: - [`mediapipe/examples/desktop/multi_hand_tracking:multi_hand_tracking_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/multi_hand_tracking/BUILD) +Tip: Maximum number of hands to detect/process is set to 2 by default. To change +it, in the graph file modify the option of `ConstantSidePacketCalculator`. + +### Python + +MediaPipe Python package is available on +[PyPI](https://pypi.org/project/mediapipe/), and can be installed simply by `pip +install mediapipe` on Linux and macOS, as described below and in this +[colab](https://mediapipe.page.link/hands_py_colab). If you do need to build the +Python package from source, see +[additional instructions](../getting_started/building_examples.md#python). + +Activate a Python virtual environment: + +```bash +$ python3 -m venv mp_env && source mp_env/bin/activate +``` + +Install MediaPipe Python package: + +```bash +(mp_env)$ pip install mediapipe +``` + +Run the following Python code: + + + +```python +import cv2 +import mediapipe as mp +mp_drawing = mp.solutions.drawing_utils +mp_hands = mp.solutions.hands + +# For static images: +hands = mp_hands.Hands( + static_image_mode=True, + max_num_hands=2, + min_detection_confidence=0.7) +for idx, file in enumerate(file_list): + # Read an image, flip it around y-axis for correct handedness output (see + # above). + image = cv2.flip(cv2.imread(file), 1) + # Convert the BGR image to RGB before processing. + results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Print handedness and draw hand landmarks on the image. 
+ print('handedness:', results.multi_handedness) + if not results.multi_hand_landmarks: + continue + annotated_image = image.copy() + for hand_landmarks in results.multi_hand_landmarks: + print('hand_landmarks:', hand_landmarks) + mp_drawing.draw_landmarks( + annotated_image, hand_landmarks, mp_hands.HAND_CONNECTIONS) + cv2.imwrite( + '/tmp/annotated_image' + str(idx) + '.png', cv2.flip(annotated_image, 1)) +hands.close() + +# For webcam input: +hands = mp_hands.Hands( + min_detection_confidence=0.7, min_tracking_confidence=0.5) +cap = cv2.VideoCapture(0) +while cap.isOpened(): + success, image = cap.read() + if not success: + break + + # Flip the image horizontally for a later selfie-view display, and convert + # the BGR image to RGB. + image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB) + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + results = hands.process(image) + + # Draw the hand annotations on the image. + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + if results.multi_hand_landmarks: + for hand_landmarks in results.multi_hand_landmarks: + mp_drawing.draw_landmarks( + image, hand_landmarks, mp_hands.HAND_CONNECTIONS) + cv2.imshow('MediaPipe Hands', image) + if cv2.waitKey(5) & 0xFF == 27: + break +hands.close() +cap.release() +``` + +Tip: Use command `deactivate` to exit the Python virtual environment. ### Web diff --git a/docs/solutions/models.md b/docs/solutions/models.md index 2ba18a750c..9cc6e1fecf 100644 --- a/docs/solutions/models.md +++ b/docs/solutions/models.md @@ -37,10 +37,10 @@ nav_order: 30 ### [Hands](https://google.github.io/mediapipe/solutions/hands) * Palm detection model: - [TFLite model](https://github.com/google/mediapipe/tree/master/mediapipe/models/palm_detection.tflite), + [TFLite model](https://github.com/google/mediapipe/tree/master/mediapipe/modules/palm_detection/palm_detection.tflite), [TF.js model](https://tfhub.dev/mediapipe/handdetector/1) * Hand landmark model: - [TFLite model](https://github.com/google/mediapipe/tree/master/mediapipe/models/hand_landmark.tflite), + [TFLite model](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark.tflite), [TF.js model](https://tfhub.dev/mediapipe/handskeleton/1) * [Model card](https://mediapipe.page.link/handmc) @@ -68,6 +68,11 @@ nav_order: 30 * [TFLite model for shoes](https://github.com/google/mediapipe/tree/master/mediapipe/models/object_detection_3d_sneakers.tflite) * [TFLite model for chairs](https://github.com/google/mediapipe/tree/master/mediapipe/models/object_detection_3d_chair.tflite) +* [TFLite model for cameras](https://github.com/google/mediapipe/tree/master/mediapipe/models/object_detection_3d_camera.tflite) +* [TFLite model for cups](https://github.com/google/mediapipe/tree/master/mediapipe/models/object_detection_3d_cup.tflite) +* [Single-stage TFLite model for shoes](https://github.com/google/mediapipe/tree/master/mediapipe/models/object_detection_3d_sneakers_1stage.tflite) +* [Single-stage TFLite model for chairs](https://github.com/google/mediapipe/tree/master/mediapipe/models/object_detection_3d_chair_1stage.tflite) +* [Model card](https://mediapipe.page.link/objectron-mc) ### [KNIFT](https://google.github.io/mediapipe/solutions/knift) diff --git a/docs/solutions/objectron.md b/docs/solutions/objectron.md index d6abcbe75e..d13c9ff2c3 100644 --- a/docs/solutions/objectron.md +++ b/docs/solutions/objectron.md @@ -15,13 +15,12 @@ nav_order: 10
## Overview MediaPipe Objectron is a mobile real-time 3D object detection solution for -everyday objects. It detects objects in 2D images, and estimates their poses and -sizes through a machine learning (ML) model, trained on a newly created 3D -dataset. +everyday objects. It detects objects in 2D images, and estimates their poses +through a machine learning (ML) model, trained on a newly created 3D dataset. -![objectron_shoe_android_gpu.gif](../images/mobile/objectron_shoe_android_gpu.gif) | ![objectron_chair_android_gpu.gif](../images/mobile/objectron_chair_android_gpu.gif) -:--------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------: -*Fig 1(a). Objectron for Shoes.* | *Fig 1(b). Objectron for Chairs.* +![objectron_shoe_android_gpu.gif](../images/mobile/objectron_shoe_android_gpu.gif) | ![objectron_chair_android_gpu.gif](../images/mobile/objectron_chair_android_gpu.gif) | ![objectron_camera_android_gpu.gif](../images/mobile/objectron_camera_android_gpu.gif) | ![objectron_cup_android_gpu.gif](../images/mobile/objectron_cup_android_gpu.gif) +:--------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------: +*Fig 1(a). Objectron for Shoes.* | *Fig 1(b). Objectron for Chairs.* | *Fig 1(c). Objectron for Cameras.* | *Fig 1(d). Objectron for Cups.* Object detection is an extensively studied computer vision problem, but most of the research has focused on @@ -85,15 +84,41 @@ able to increase the accuracy by about 10%. :-------------------------------------------------------------------------------------------: | *Fig 4. An example of AR synthetic data generation. The virtual white-brown cereal box is rendered into the real scene, next to the real blue book.* | -## ML Model for 3D Object Detection +## ML Pipelines for 3D Object Detection + +We built two ML pipelines to predict the 3D bounding box of an object from a +single RGB image: one is a two-stage pipeline and the other is a single-stage +pipeline. The two-stage pipeline is 3x faster than the single-stage pipeline +with similar or better accuracy. The single stage pipeline is good at detecting +multiple objects, whereas the two stage pipeline is good for a single dominant +object. + +### Two-stage Pipeline + +Our two-stage pipeline is illustrated by the diagram in Fig 5. The first stage +uses an object detector to find the 2D crop of the object. The second stage +takes the image crop and estimates the 3D bounding box. At the same time, it +also computes the 2D crop of the object for the next frame, such that the object +detector does not need to run every frame. + +![objectron_network_architecture.png](../images/objectron_2stage_network_architecture.png) | +:----------------------------------------------------------------------------------------: | +*Fig 5. Network architecture and post-processing for two-stage 3D object detection.* | + +We can use any 2D object detector for the first stage. In this solution, we use +[TensorFlow Object Detection](https://github.com/tensorflow/models/tree/master/research/object_detection). +The second stage 3D bounding box predictor we released runs 83FPS on Adreno 650 +mobile GPU. 
+ +### Single-stage Pipeline ![objectron_network_architecture.png](../images/objectron_network_architecture.png) | :---------------------------------------------------------------------------------: | -*Fig 5. Network architecture and post-processing for 3D object detection.* | +*Fig 6. Network architecture and post-processing for single-stage 3D object detection.* | -We [built a single-stage model](https://arxiv.org/abs/2003.03522) to predict the -pose and physical size of an object from a single RGB image. The model backbone -has an encoder-decoder architecture, built upon +Our [single-stage pipeline](https://arxiv.org/abs/2003.03522) is illustrated by +the diagram in Fig 6, the model backbone has an encoder-decoder architecture, +built upon [MobileNetv2](https://ai.googleblog.com/2018/04/mobilenetv2-next-generation-of-on.html). We employ a multi-task learning approach, jointly predicting an object's shape with detection and regression. The shape task predicts the object's shape @@ -114,9 +139,9 @@ size of the object. The model is light enough to run real-time on mobile devices ![objectron_sample_network_results.png](../images/objectron_sample_network_results.png) | :-------------------------------------------------------------------------------------: | -*Fig 6. Sample results of our network — (Left) original 2D image with estimated bounding boxes, (Middle) object detection by Gaussian distribution, (Right) predicted segmentation mask.* | +*Fig 7. Sample results of our network — (Left) original 2D image with estimated bounding boxes, (Middle) object detection by Gaussian distribution, (Right) predicted segmentation mask.* | -## Detection and Tracking Pipeline +#### Detection and Tracking When the model is applied to every frame captured by the mobile device, it can suffer from jitter due to the ambiguity of the 3D bounding box estimated in each @@ -130,7 +155,7 @@ temporally consistent, reducing the jitter. The Objectron 3D object detection and tracking pipeline is implemented as a MediaPipe -[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt), +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/object_occlusion_tracking_1stage.pbtxt), which internally uses a [detection subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/subgraphs/objectron_detection_gpu.pbtxt) and a @@ -147,6 +172,12 @@ new detection becomes available from the detection subgraph, the tracking subgraph is also responsible for consolidation between the detection and tracking results, based on the area of overlap. +## Objectron Dataset + +We also released our [Objectron dataset](http://objectron.dev), with which we +trained our 3D object detection models. The technical details of the Objectron +dataset, including usage and tutorials, are available on the dataset website. + ## Example Apps Please first see general instructions for @@ -158,32 +189,72 @@ Note: To visualize a graph, copy the graph and paste it into to visualize its associated subgraphs, please see [visualizer documentation](../tools/visualizer.md). 
-### Objectron for Shoes +### Two-stage Objectron * Graph: - [`mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt) + [`mediapipe/graphs/object_detection_3d/object_occlusion_tracking.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/object_occlusion_tracking.pbtxt) + * Android target: - [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1S0K4hbWt3o31FfQ4QU3Rz7IHrvOUMx1d) - [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD). + + Build for **shoes** (default) with: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1ANW9WDOCb8QO1r8gDC03A4UgrPkICdPP/view?usp=sharing) + + ```bash + bazel build -c opt --config android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d + ``` + + Build for **chairs** with: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1lcUv1TBnv_SxnKSQwdOqbdLa9mkaTJHy/view?usp=sharing) + + ```bash + bazel build -c opt --config android_arm64 --define chair=true mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d + ``` + + Build for **cups** with: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1bf77KDkowwrduleiC9B1M1XnEhjnOQbX/view?usp=sharing) + + ```bash + bazel build -c opt --config android_arm64 --define cup=true mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d + ``` + + Build for **cameras** with: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1GM7lPO-s5URVxIzQur1bLsionEJs3yIl/view?usp=sharing) + + ```bash + bazel build -c opt --config android_arm64 --define camera=true mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d + ``` + * iOS target: Not available -### Objectron for Chairs +### Single-stage Objectron * Graph: - [`mediapipe/graphs/hair_segmentation/hair_segmentation_mobile_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/chair_classic_occlusion_tracking.pbtxt) + [`mediapipe/graphs/object_detection_3d/object_occlusion_tracking_1stage.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/object_occlusion_tracking.pbtxt) + * Android target: - [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1MM8K-13bXLCVS1EHQ-KgkVyEahEPrKej) - [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD) - and add `--define chair=true` to the build command, i.e., + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD). 
+ + Build with **single-stage** model for **shoes** with: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1MvaEg4dkvKN8jAU1Z2GtudyXi1rQHYsE/view?usp=sharing) ```bash - bazel build -c opt --config android_arm64 --define chair=true mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d + bazel build -c opt --config android_arm64 --define shoe_1stage=true mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d + ``` + + Build with **single-stage** model for **chairs** with: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1GJL4z3jr-wD1jMHGd4NBfOG-Yoq5t167/view?usp=sharing) + + ```bash + bazel build -c opt --config android_arm64 --define chair_1stage=true mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d ``` * iOS target: Not available ## Resources +* Google AI Blog: + [Announcing the Objectron Dataset](https://mediapipe.page.link/objectron_dataset_ai_blog) * Google AI Blog: [Real-Time 3D Object Detection on Mobile Devices with MediaPipe](https://ai.googleblog.com/2020/03/real-time-3d-object-detection-on-mobile.html) * Paper: [MobilePose: Real-Time Pose Estimation for Unseen Objects with Weak diff --git a/docs/solutions/pose.md b/docs/solutions/pose.md index cb20f22725..c9c7f71592 100644 --- a/docs/solutions/pose.md +++ b/docs/solutions/pose.md @@ -5,7 +5,7 @@ parent: Solutions nav_order: 5 --- -# MediaPipe BlazePose +# MediaPipe Pose {: .no_toc } 1. TOC @@ -88,12 +88,11 @@ hip midpoints. ### Pose Landmark Model (BlazePose Tracker) The landmark model currently included in MediaPipe Pose predicts the location of -25 upper-body landmarks (see figure below), each with `(x, y, z, visibility)`, -plus two virtual alignment keypoints. Note that the `z` value should be -discarded as the model is currently not fully trained to predict depth, but this -is something we have on the roadmap. The model shares the same architecture as -the full-body version that predicts 33 landmarks, described in more detail in -the +25 upper-body landmarks (see figure below), each with `(x, y, z, visibility)`. +Note that the `z` value should be discarded as the model is currently not fully +trained to predict depth, but this is something we have on the roadmap. The +model shares the same architecture as the full-body version that predicts 33 +landmarks, described in more detail in the [BlazePose Google AI Blog](https://ai.googleblog.com/2020/08/on-device-real-time-body-pose-tracking.html) and in this [paper](https://arxiv.org/abs/2006.10204). @@ -147,35 +146,77 @@ MediaPipe examples. MediaPipe Python package is available on [PyPI](https://pypi.org/project/mediapipe/), and can be installed simply by `pip install mediapipe` on Linux and macOS, as described below and in this -[colab](https://mediapipe.page.link/mp-py-colab). If you do need to build the +[colab](https://mediapipe.page.link/pose_py_colab). If you do need to build the Python package from source, see [additional instructions](../getting_started/building_examples.md#python). +Activate a Python virtual environment: + ```bash -# Activate a Python virtual environment. 
$ python3 -m venv mp_env && source mp_env/bin/activate +``` -# Install MediaPipe Python package -(mp_env)$ pip install mediapipe - -# Run in Python interpreter -(mp_env)$ python3 ->>> import mediapipe as mp ->>> pose_tracker = mp.examples.UpperBodyPoseTracker() - -# For image input ->>> pose_landmarks, _ = pose_tracker.run(input_file='/path/to/input/file', output_file='/path/to/output/file') ->>> pose_landmarks, annotated_image = pose_tracker.run(input_file='/path/to/file') -# To print out the pose landmarks, you can simply do "print(pose_landmarks)". -# However, the data points can be more accessible with the following approach. ->>> [print('x is', data_point.x, 'y is', data_point.y, 'z is', data_point.z, 'visibility is', data_point.visibility) for data_point in pose_landmarks.landmark] +Install MediaPipe Python package: -# For live camera input -# (Press Esc within the output image window to stop the run or let it self terminate after 30 seconds.) ->>> pose_tracker.run_live() +```bash +(mp_env)$ pip install mediapipe +``` -# Close the tracker. ->>> pose_tracker.close() +Run the following Python code: + + + +```python +import cv2 +import mediapipe as mp +mp_drawing = mp.solutions.drawing_utils +mp_pose = mp.solutions.pose + +# For static images: +pose = mp_pose.Pose( + static_image_mode=True, min_detection_confidence=0.5) +for idx, file in enumerate(file_list): + image = cv2.imread(file) + # Convert the BGR image to RGB before processing. + results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Print and draw pose landmarks on the image. + print( + 'nose landmark:', + results.pose_landmarks.landmark[mp_pose.PoseLandmark.NOSE]) + annotated_image = image.copy() + mp_drawing.draw_landmarks( + annotated_image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS) + cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image) +pose.close() + +# For webcam input: +pose = mp_pose.Pose( + min_detection_confidence=0.5, min_tracking_confidence=0.5) +cap = cv2.VideoCapture(0) +while cap.isOpened(): + success, image = cap.read() + if not success: + break + + # Flip the image horizontally for a later selfie-view display, and convert + # the BGR image to RGB. + image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB) + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + results = pose.process(image) + + # Draw the pose annotation on the image. + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + mp_drawing.draw_landmarks( + image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS) + cv2.imshow('MediaPipe Pose', image) + if cv2.waitKey(5) & 0xFF == 27: + break +pose.close() +cap.release() ``` Tip: Use command `deactivate` to exit the Python virtual environment.
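For quick reference, the per-landmark values described earlier (`x`, `y`, `z`, `visibility`) can be read directly from the `results.pose_landmarks` produced by the snippet above. A minimal sketch follows; the `None` guard is an assumption added here, not taken from the snippet:

```python
# Read individual upper-body landmarks from the `results` computed above.
# `results.pose_landmarks` can be None when no pose was detected, so guard first.
if results.pose_landmarks:
  nose = results.pose_landmarks.landmark[mp_pose.PoseLandmark.NOSE]
  print('nose:', nose.x, nose.y, nose.visibility)
  # Iterate over all landmarks; per the note above, `z` should be discarded.
  for i, landmark in enumerate(results.pose_landmarks.landmark):
    print(i, landmark.x, landmark.y, landmark.visibility)
```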
diff --git a/docs/solutions/solutions.md b/docs/solutions/solutions.md index 66e64ef75f..efb853cee8 100644 --- a/docs/solutions/solutions.md +++ b/docs/solutions/solutions.md @@ -19,9 +19,9 @@ has_toc: false []() | Android | iOS | Desktop | Python | Web | Coral :---------------------------------------------------------------------------------------- | :-----: | :-: | :-----: | :----: | :-: | :---: [Face Detection](https://google.github.io/mediapipe/solutions/face_detection) | ✅ | ✅ | ✅ | | ✅ | ✅ -[Face Mesh](https://google.github.io/mediapipe/solutions/face_mesh) | ✅ | ✅ | ✅ | | | +[Face Mesh](https://google.github.io/mediapipe/solutions/face_mesh) | ✅ | ✅ | ✅ | ✅ | | [Iris](https://google.github.io/mediapipe/solutions/iris) | ✅ | ✅ | ✅ | | ✅ | -[Hands](https://google.github.io/mediapipe/solutions/hands) | ✅ | ✅ | ✅ | | ✅ | +[Hands](https://google.github.io/mediapipe/solutions/hands) | ✅ | ✅ | ✅ | ✅ | ✅ | [Pose](https://google.github.io/mediapipe/solutions/pose) | ✅ | ✅ | ✅ | ✅ | ✅ | [Hair Segmentation](https://google.github.io/mediapipe/solutions/hair_segmentation) | ✅ | | ✅ | | ✅ | [Object Detection](https://google.github.io/mediapipe/solutions/object_detection) | ✅ | ✅ | ✅ | | | ✅ diff --git a/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen b/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen index f35ed538a5..95664a8ca7 100644 --- a/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen +++ b/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen @@ -15,7 +15,6 @@ "mediapipe/examples/ios/handdetectiongpu/BUILD", "mediapipe/examples/ios/handtrackinggpu/BUILD", "mediapipe/examples/ios/iristrackinggpu/BUILD", - "mediapipe/examples/ios/multihandtrackinggpu/BUILD", "mediapipe/examples/ios/objectdetectioncpu/BUILD", "mediapipe/examples/ios/objectdetectiongpu/BUILD", "mediapipe/examples/ios/upperbodyposetrackinggpu/BUILD" @@ -29,7 +28,6 @@ "//mediapipe/examples/ios/handdetectiongpu:HandDetectionGpuApp", "//mediapipe/examples/ios/handtrackinggpu:HandTrackingGpuApp", "//mediapipe/examples/ios/iristrackinggpu:IrisTrackingGpuApp", - "//mediapipe/examples/ios/multihandtrackinggpu:MultiHandTrackingGpuApp", "//mediapipe/examples/ios/objectdetectioncpu:ObjectDetectionCpuApp", "//mediapipe/examples/ios/objectdetectiongpu:ObjectDetectionGpuApp", "//mediapipe/examples/ios/upperbodyposetrackinggpu:UpperBodyPoseTrackingGpuApp", @@ -97,7 +95,6 @@ "mediapipe/examples/ios/handdetectiongpu", "mediapipe/examples/ios/handtrackinggpu", "mediapipe/examples/ios/iristrackinggpu", - "mediapipe/examples/ios/multihandtrackinggpu", "mediapipe/examples/ios/objectdetectioncpu", "mediapipe/examples/ios/objectdetectiongpu", "mediapipe/examples/ios/upperbodyposetrackinggpu", diff --git a/mediapipe/MediaPipe.tulsiproj/project.tulsiconf b/mediapipe/MediaPipe.tulsiproj/project.tulsiconf index f202cedde7..241b121ba1 100644 --- a/mediapipe/MediaPipe.tulsiproj/project.tulsiconf +++ b/mediapipe/MediaPipe.tulsiproj/project.tulsiconf @@ -18,7 +18,6 @@ "mediapipe/examples/ios/handdetectiongpu", "mediapipe/examples/ios/handtrackinggpu", "mediapipe/examples/ios/iristrackinggpu", - "mediapipe/examples/ios/multihandtrackinggpu", "mediapipe/examples/ios/objectdetectioncpu", "mediapipe/examples/ios/objectdetectiongpu", "mediapipe/examples/ios/upperbodyposetrackinggpu" diff --git a/mediapipe/calculators/core/BUILD b/mediapipe/calculators/core/BUILD index 57d8da7092..bda9700b96 100644 --- a/mediapipe/calculators/core/BUILD +++ b/mediapipe/calculators/core/BUILD @@ -116,6 +116,7 @@ mediapipe_proto_library( deps = [ 
"//mediapipe/framework:calculator_options_proto", "//mediapipe/framework:calculator_proto", + "//mediapipe/framework/formats:classification_proto", ], ) @@ -240,6 +241,7 @@ cc_library( "//mediapipe/framework:calculator_framework", "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/formats:landmark_cc_proto", + "//mediapipe/framework/formats:tensor", "//mediapipe/framework/port:integral_types", "//mediapipe/framework/port:ret_check", "//mediapipe/framework/port:status", @@ -800,14 +802,23 @@ cc_library( name = "split_vector_calculator", srcs = ["split_vector_calculator.cc"], hdrs = ["split_vector_calculator.h"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), visibility = ["//visibility:public"], deps = [ ":split_vector_calculator_cc_proto", "//mediapipe/framework/formats:detection_cc_proto", "//mediapipe/framework:calculator_framework", "//mediapipe/framework/formats:landmark_cc_proto", + "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/formats:rect_cc_proto", "//mediapipe/framework/formats:matrix", + "//mediapipe/framework/formats:tensor", "//mediapipe/framework/port:ret_check", "//mediapipe/framework/port:status", "//mediapipe/util:resource_util", @@ -1069,6 +1080,7 @@ cc_library( ":constant_side_packet_calculator_cc_proto", "//mediapipe/framework:calculator_framework", "//mediapipe/framework:collection_item_id", + "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/port:integral_types", "//mediapipe/framework/port:ret_check", "//mediapipe/framework/port:status", diff --git a/mediapipe/calculators/core/clip_vector_size_calculator.proto b/mediapipe/calculators/core/clip_vector_size_calculator.proto index 5dea660d66..6044f77c86 100644 --- a/mediapipe/calculators/core/clip_vector_size_calculator.proto +++ b/mediapipe/calculators/core/clip_vector_size_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message ClipVectorSizeCalculatorOptions { extend CalculatorOptions { optional ClipVectorSizeCalculatorOptions ext = 274674998; diff --git a/mediapipe/calculators/core/concatenate_vector_calculator.cc b/mediapipe/calculators/core/concatenate_vector_calculator.cc index 93fa5b3a3b..3094250801 100644 --- a/mediapipe/calculators/core/concatenate_vector_calculator.cc +++ b/mediapipe/calculators/core/concatenate_vector_calculator.cc @@ -18,6 +18,7 @@ #include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/landmark.pb.h" +#include "mediapipe/framework/formats/tensor.h" #include "mediapipe/framework/port/integral_types.h" #include "tensorflow/lite/interpreter.h" @@ -64,6 +65,9 @@ typedef ConcatenateVectorCalculator ConcatenateTfLiteTensorVectorCalculator; REGISTER_CALCULATOR(ConcatenateTfLiteTensorVectorCalculator); +typedef ConcatenateVectorCalculator ConcatenateTensorVectorCalculator; +REGISTER_CALCULATOR(ConcatenateTensorVectorCalculator); + typedef ConcatenateVectorCalculator<::mediapipe::NormalizedLandmark> ConcatenateLandmarkVectorCalculator; REGISTER_CALCULATOR(ConcatenateLandmarkVectorCalculator); diff --git a/mediapipe/calculators/core/concatenate_vector_calculator.proto b/mediapipe/calculators/core/concatenate_vector_calculator.proto index bddb8af959..3753ffb5d1 100644 --- a/mediapipe/calculators/core/concatenate_vector_calculator.proto +++ 
b/mediapipe/calculators/core/concatenate_vector_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message ConcatenateVectorCalculatorOptions { extend CalculatorOptions { optional ConcatenateVectorCalculatorOptions ext = 259397839; diff --git a/mediapipe/calculators/core/constant_side_packet_calculator.cc b/mediapipe/calculators/core/constant_side_packet_calculator.cc index 7541ccd660..213264dc1e 100644 --- a/mediapipe/calculators/core/constant_side_packet_calculator.cc +++ b/mediapipe/calculators/core/constant_side_packet_calculator.cc @@ -17,6 +17,7 @@ #include "mediapipe/calculators/core/constant_side_packet_calculator.pb.h" #include "mediapipe/framework/calculator_framework.h" #include "mediapipe/framework/collection_item_id.h" +#include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/port/canonical_errors.h" #include "mediapipe/framework/port/integral_types.h" #include "mediapipe/framework/port/ret_check.h" @@ -24,6 +25,8 @@ namespace mediapipe { +namespace {} // namespace + // Generates an output side packet or multiple output side packets according to // the specified options. // @@ -74,6 +77,8 @@ class ConstantSidePacketCalculator : public CalculatorBase { packet.Set<std::string>(); } else if (packet_options.has_uint64_value()) { packet.Set<uint64>(); + } else if (packet_options.has_classification_list_value()) { + packet.Set<ClassificationList>(); } else { return ::mediapipe::InvalidArgumentError( "None of supported values were specified in options."); } @@ -100,6 +105,9 @@ class ConstantSidePacketCalculator : public CalculatorBase { packet.Set(MakePacket<std::string>(packet_options.string_value())); } else if (packet_options.has_uint64_value()) { packet.Set(MakePacket<uint64>(packet_options.uint64_value())); + } else if (packet_options.has_classification_list_value()) { + packet.Set(MakePacket<ClassificationList>( + packet_options.classification_list_value())); } else { return ::mediapipe::InvalidArgumentError( "None of supported values were specified in options."); diff --git a/mediapipe/calculators/core/constant_side_packet_calculator.proto b/mediapipe/calculators/core/constant_side_packet_calculator.proto index 6b3feebdec..57f5dc5452 100644 --- a/mediapipe/calculators/core/constant_side_packet_calculator.proto +++ b/mediapipe/calculators/core/constant_side_packet_calculator.proto @@ -17,6 +17,9 @@ syntax = "proto2"; package mediapipe; import "mediapipe/framework/calculator.proto"; +import "mediapipe/framework/formats/classification.proto"; + +option objc_class_prefix = "MediaPipe"; message ConstantSidePacketCalculatorOptions { extend CalculatorOptions { @@ -30,6 +33,7 @@ message ConstantSidePacketCalculatorOptions { bool bool_value = 3; string string_value = 4; uint64 uint64_value = 5; + ClassificationList classification_list_value = 6; } } diff --git a/mediapipe/calculators/core/dequantize_byte_array_calculator.proto b/mediapipe/calculators/core/dequantize_byte_array_calculator.proto index 3032dbf48c..3af8e11ef6 100644 --- a/mediapipe/calculators/core/dequantize_byte_array_calculator.proto +++ b/mediapipe/calculators/core/dequantize_byte_array_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message DequantizeByteArrayCalculatorOptions { extend CalculatorOptions { optional DequantizeByteArrayCalculatorOptions ext = 272316343; diff --git a/mediapipe/calculators/core/gate_calculator.proto b/mediapipe/calculators/core/gate_calculator.proto index
d9a1b69d46..76bacc74e7 100644 --- a/mediapipe/calculators/core/gate_calculator.proto +++ b/mediapipe/calculators/core/gate_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message GateCalculatorOptions { extend mediapipe.CalculatorOptions { optional GateCalculatorOptions ext = 261754847; diff --git a/mediapipe/calculators/core/packet_cloner_calculator.proto b/mediapipe/calculators/core/packet_cloner_calculator.proto index 7abb16163f..e30672fabf 100644 --- a/mediapipe/calculators/core/packet_cloner_calculator.proto +++ b/mediapipe/calculators/core/packet_cloner_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message PacketClonerCalculatorOptions { extend CalculatorOptions { optional PacketClonerCalculatorOptions ext = 258872085; diff --git a/mediapipe/calculators/core/packet_resampler_calculator.proto b/mediapipe/calculators/core/packet_resampler_calculator.proto index f23ce1fdcc..d037ee9ded 100644 --- a/mediapipe/calculators/core/packet_resampler_calculator.proto +++ b/mediapipe/calculators/core/packet_resampler_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message PacketResamplerCalculatorOptions { extend CalculatorOptions { optional PacketResamplerCalculatorOptions ext = 95743844; diff --git a/mediapipe/calculators/core/packet_thinner_calculator.proto b/mediapipe/calculators/core/packet_thinner_calculator.proto index 34fd9bc325..6c69f3afd6 100644 --- a/mediapipe/calculators/core/packet_thinner_calculator.proto +++ b/mediapipe/calculators/core/packet_thinner_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message PacketThinnerCalculatorOptions { extend CalculatorOptions { optional PacketThinnerCalculatorOptions ext = 288533508; diff --git a/mediapipe/calculators/core/quantize_float_vector_calculator.proto b/mediapipe/calculators/core/quantize_float_vector_calculator.proto index 3f6cfda21e..0ccc3c0d9c 100644 --- a/mediapipe/calculators/core/quantize_float_vector_calculator.proto +++ b/mediapipe/calculators/core/quantize_float_vector_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message QuantizeFloatVectorCalculatorOptions { extend CalculatorOptions { optional QuantizeFloatVectorCalculatorOptions ext = 259848061; diff --git a/mediapipe/calculators/core/sequence_shift_calculator.cc b/mediapipe/calculators/core/sequence_shift_calculator.cc index f2ab110257..425795fa22 100644 --- a/mediapipe/calculators/core/sequence_shift_calculator.cc +++ b/mediapipe/calculators/core/sequence_shift_calculator.cc @@ -32,6 +32,9 @@ class SequenceShiftCalculator : public CalculatorBase { public: static ::mediapipe::Status GetContract(CalculatorContract* cc) { cc->Inputs().Index(0).SetAny(); + if (cc->InputSidePackets().HasTag(kPacketOffsetTag)) { + cc->InputSidePackets().Tag(kPacketOffsetTag).Set<int>(); + } cc->Outputs().Index(0).SetSameAs(&cc->Inputs().Index(0)); return ::mediapipe::OkStatus(); } @@ -41,6 +44,8 @@ class SequenceShiftCalculator : public CalculatorBase { ::mediapipe::Status Process(CalculatorContext* cc) override; private: + static constexpr const char* kPacketOffsetTag = "PACKET_OFFSET"; + // A positive offset means we want
a packet to be output with the timestamp of // a later packet. Stores packets waiting for their output timestamps and // outputs a single packet when the cache fills. @@ -70,6 +75,9 @@ REGISTER_CALCULATOR(SequenceShiftCalculator); ::mediapipe::Status SequenceShiftCalculator::Open(CalculatorContext* cc) { packet_offset_ = cc->Options().packet_offset(); + if (cc->InputSidePackets().HasTag(kPacketOffsetTag)) { + packet_offset_ = cc->InputSidePackets().Tag(kPacketOffsetTag).Get(); + } cache_size_ = abs(packet_offset_); // An offset of zero is a no-op, but someone might still request it. if (packet_offset_ == 0) { diff --git a/mediapipe/calculators/core/sequence_shift_calculator.proto b/mediapipe/calculators/core/sequence_shift_calculator.proto index 15b111d71f..cdcd284ca2 100644 --- a/mediapipe/calculators/core/sequence_shift_calculator.proto +++ b/mediapipe/calculators/core/sequence_shift_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + message SequenceShiftCalculatorOptions { extend CalculatorOptions { optional SequenceShiftCalculatorOptions ext = 107633927; diff --git a/mediapipe/calculators/core/sequence_shift_calculator_test.cc b/mediapipe/calculators/core/sequence_shift_calculator_test.cc index 1fee61daaa..23ad572257 100644 --- a/mediapipe/calculators/core/sequence_shift_calculator_test.cc +++ b/mediapipe/calculators/core/sequence_shift_calculator_test.cc @@ -99,6 +99,35 @@ TEST(SequenceShiftCalculatorTest, NegativeShift) { } } +// Tests using a side packet to specify the offset. Shifting by -2, i.e., +// output input[i] with timestamp[i - 2]. The first two packets should be +// dropped. +TEST(SequenceShiftCalculatorTest, SidePacketOffset) { + CalculatorGraphConfig::Node node; + node.set_calculator("SequenceShiftCalculator"); + node.add_input_stream("input"); + node.add_output_stream("output"); + node.add_input_side_packet("PACKET_OFFSET:packet_offset"); + + CalculatorRunner runner(node); + AddPackets(&runner); + runner.MutableSidePackets()->Tag("PACKET_OFFSET") = Adopt(new int(-2)); + MP_ASSERT_OK(runner.Run()); + const std::vector& input_packets = + runner.MutableInputs()->Index(0).packets; + const std::vector& output_packets = runner.Outputs().Index(0).packets; + ASSERT_EQ(10, input_packets.size()); + // Input packet[i] should be output with the timestamp of input packet[i - 2]. + // The first two packets are dropped. This means timestamps match between + // input and output packets, but the data in the output packets come from + // input_packets[i + 2]. 
+ ASSERT_EQ(8, output_packets.size()); + for (int i = 0; i < output_packets.size(); ++i) { + EXPECT_EQ(input_packets[i].Timestamp(), output_packets[i].Timestamp()); + EXPECT_EQ(input_packets[i + 2].Get<int>(), output_packets[i].Get<int>()); + } +} + } // namespace } // namespace mediapipe diff --git a/mediapipe/calculators/core/split_vector_calculator.cc b/mediapipe/calculators/core/split_vector_calculator.cc index 100507c999..730a8e34e3 100644 --- a/mediapipe/calculators/core/split_vector_calculator.cc +++ b/mediapipe/calculators/core/split_vector_calculator.cc @@ -16,10 +16,12 @@ #include <vector> +#include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/detection.pb.h" #include "mediapipe/framework/formats/landmark.pb.h" #include "mediapipe/framework/formats/matrix.h" #include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/framework/formats/tensor.h" #include "tensorflow/lite/interpreter.h" #if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) @@ -46,15 +48,18 @@ typedef SplitVectorCalculator<TfLiteTensor, false> SplitTfLiteTensorVectorCalculator; REGISTER_CALCULATOR(SplitTfLiteTensorVectorCalculator); -typedef SplitVectorCalculator<::mediapipe::NormalizedLandmark, false> +typedef SplitVectorCalculator<Tensor, true> SplitTensorVectorCalculator; +REGISTER_CALCULATOR(SplitTensorVectorCalculator); + +typedef SplitVectorCalculator<mediapipe::NormalizedLandmark, false> SplitLandmarkVectorCalculator; REGISTER_CALCULATOR(SplitLandmarkVectorCalculator); -typedef SplitVectorCalculator<::mediapipe::NormalizedLandmarkList, false> +typedef SplitVectorCalculator<mediapipe::NormalizedLandmarkList, false> SplitNormalizedLandmarkListVectorCalculator; REGISTER_CALCULATOR(SplitNormalizedLandmarkListVectorCalculator); -typedef SplitVectorCalculator<::mediapipe::NormalizedRect, false> +typedef SplitVectorCalculator<mediapipe::NormalizedRect, false> SplitNormalizedRectVectorCalculator; REGISTER_CALCULATOR(SplitNormalizedRectVectorCalculator); @@ -67,8 +72,12 @@ typedef SplitVectorCalculator<::tflite::gpu::gl::GlBuffer, true> REGISTER_CALCULATOR(MovableSplitGlBufferVectorCalculator); #endif -typedef SplitVectorCalculator<::mediapipe::Detection, false> +typedef SplitVectorCalculator<mediapipe::Detection, false> SplitDetectionVectorCalculator; REGISTER_CALCULATOR(SplitDetectionVectorCalculator); +typedef SplitVectorCalculator<mediapipe::ClassificationList, false> + SplitClassificationListVectorCalculator; +REGISTER_CALCULATOR(SplitClassificationListVectorCalculator); + } // namespace mediapipe diff --git a/mediapipe/calculators/core/split_vector_calculator.proto b/mediapipe/calculators/core/split_vector_calculator.proto index 53acbb7bf5..40301f88b9 100644 --- a/mediapipe/calculators/core/split_vector_calculator.proto +++ b/mediapipe/calculators/core/split_vector_calculator.proto @@ -18,6 +18,8 @@ package mediapipe; import "mediapipe/framework/calculator.proto"; +option objc_class_prefix = "MediaPipe"; + // A Range {begin, end} specifies beginning and ending indices to splice a // vector. A vector v is spliced to have elements v[begin:(end-1)], i.e., with // begin index inclusive and end index exclusive.
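For reference, the PACKET_OFFSET side packet introduced in sequence_shift_calculator.cc above can be wired up from C++ the same way the new SidePacketOffset test does. The sketch below is illustrative only (the stream names and the offset value are placeholders, not taken from any existing graph):

// Minimal sketch: supplying SequenceShiftCalculator's offset as an input side
// packet instead of through SequenceShiftCalculatorOptions.
#include "mediapipe/framework/calculator_runner.h"
#include "mediapipe/framework/packet.h"

namespace mediapipe {

void RunSequenceShiftWithSidePacketOffset() {
  CalculatorGraphConfig::Node node;
  node.set_calculator("SequenceShiftCalculator");
  node.add_input_stream("input");    // placeholder stream names
  node.add_output_stream("output");
  node.add_input_side_packet("PACKET_OFFSET:packet_offset");

  CalculatorRunner runner(node);
  // When present, the side packet overrides packet_offset from the options.
  runner.MutableSidePackets()->Tag("PACKET_OFFSET") = Adopt(new int(-2));
  // ... add timestamped input packets and call runner.Run(), as in the
  // SidePacketOffset test above.
}

}  // namespace mediapipe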
diff --git a/mediapipe/calculators/image/bilateral_filter_calculator.cc b/mediapipe/calculators/image/bilateral_filter_calculator.cc index b366caf7ae..8d3d26f2dc 100644 --- a/mediapipe/calculators/image/bilateral_filter_calculator.cc +++ b/mediapipe/calculators/image/bilateral_filter_calculator.cc @@ -107,7 +107,7 @@ class BilateralFilterCalculator : public CalculatorBase { GLuint program_ = 0; GLuint vao_; GLuint vbo_[2]; // vertex storage -#endif // !MEDIAPIPE_DISABLE_GPU +#endif // !MEDIAPIPE_DISABLE_GPU }; REGISTER_CALCULATOR(BilateralFilterCalculator); diff --git a/mediapipe/calculators/image/image_transformation_calculator.cc b/mediapipe/calculators/image/image_transformation_calculator.cc index 37539d814d..3fb4e68074 100644 --- a/mediapipe/calculators/image/image_transformation_calculator.cc +++ b/mediapipe/calculators/image/image_transformation_calculator.cc @@ -519,7 +519,7 @@ ::mediapipe::Status ImageTransformationCalculator::RenderGpu( renderer = yuv_renderer_.get(); src1 = gpu_helper_.CreateSourceTexture(input, 0); } else // NOLINT(readability/braces) -#endif // iOS +#endif // iOS { src1 = gpu_helper_.CreateSourceTexture(input); #if defined(TEXTURE_EXTERNAL_OES) @@ -531,7 +531,7 @@ ::mediapipe::Status ImageTransformationCalculator::RenderGpu( } renderer = ext_rgb_renderer_.get(); } else // NOLINT(readability/braces) -#endif // TEXTURE_EXTERNAL_OES +#endif // TEXTURE_EXTERNAL_OES { if (!rgb_renderer_) { rgb_renderer_ = absl::make_unique(); diff --git a/mediapipe/calculators/tensor/BUILD b/mediapipe/calculators/tensor/BUILD new file mode 100644 index 0000000000..f236413b6e --- /dev/null +++ b/mediapipe/calculators/tensor/BUILD @@ -0,0 +1,631 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +load("@bazel_skylib//lib:selects.bzl", "selects") +load("//mediapipe/framework/port:build_config.bzl", "mediapipe_proto_library") + +licenses(["notice"]) + +package(default_visibility = ["//visibility:private"]) + +selects.config_setting_group( + name = "compute_shader_unavailable", + match_any = [ + "//mediapipe/gpu:disable_gpu", + ], +) + +mediapipe_proto_library( + name = "inference_calculator_proto", + srcs = ["inference_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +cc_library( + name = "inference_calculator", + srcs = ["inference_calculator.cc"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + features = ["-layering_check"], # allow depending on inference_calculator_gpu_deps + linkopts = select({ + "//mediapipe:apple": [ + "-framework CoreVideo", + "-framework MetalKit", + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":inference_calculator_cc_proto", + "@com_google_absl//absl/memory", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:tensor", + "//mediapipe/util:resource_util", + "//mediapipe/util/tflite:config", + "@org_tensorflow//tensorflow/lite:framework", + "@org_tensorflow//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", + "@org_tensorflow//tensorflow/lite/kernels:builtin_ops", + "//mediapipe/framework/stream_handler:fixed_size_input_stream_handler", + "//mediapipe/framework/port:ret_check", + ] + select({ + ":compute_shader_unavailable": [], + "//conditions:default": [":inference_calculator_gpu_deps"], + }) + select({ + "//conditions:default": [], + "//mediapipe:android": [ + "//mediapipe/util/android/file/base", + "@org_tensorflow//tensorflow/lite/delegates/nnapi:nnapi_delegate", + ], + }) + select({ + "//conditions:default": [ + "//mediapipe/util:cpu_util", + ], + }), + alwayslink = 1, +) + +cc_library( + name = "inference_calculator_gpu_deps", + deps = selects.with_or({ + "//mediapipe:ios": [ + "//mediapipe/gpu:MPPMetalHelper", + "//mediapipe/gpu:MPPMetalUtil", + "//mediapipe/gpu:gpu_buffer", + "//mediapipe/objc:mediapipe_framework_ios", + "@org_tensorflow//tensorflow/lite/delegates/gpu/common:shape", + "@org_tensorflow//tensorflow/lite/delegates/gpu/metal:buffer_convert", + "@org_tensorflow//tensorflow/lite/delegates/gpu:metal_delegate", + "@org_tensorflow//tensorflow/lite/delegates/gpu:metal_delegate_internal", + ], + "//mediapipe:macos": [], + "//conditions:default": [ + "//mediapipe/util/tflite:tflite_gpu_runner", + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gpu_buffer", + "@org_tensorflow//tensorflow/lite/delegates/gpu/common:shape", + "@org_tensorflow//tensorflow/lite/delegates/gpu:gl_delegate", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_buffer", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_program", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_shader", + ], + }), +) + +mediapipe_proto_library( + name = "tensor_converter_calculator_proto", + srcs = ["tensor_converter_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +cc_library( + name = "tensor_converter_calculator", + srcs = ["tensor_converter_calculator.cc"], + copts = select({ + "//mediapipe:apple": [ + "-x 
objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + features = ["-layering_check"], # allow depending on tensor_converter_calculator_gpu_deps + linkopts = select({ + "//mediapipe:apple": [ + "-framework CoreVideo", + "-framework MetalKit", + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":tensor_converter_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/formats:matrix", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework:port", + "//mediapipe/util:resource_util", + ] + select({ + "//mediapipe/gpu:disable_gpu": [], + "//conditions:default": ["tensor_converter_calculator_gpu_deps"], + }), + alwayslink = 1, +) + +cc_library( + name = "tensor_converter_calculator_gpu_deps", + deps = select({ + "//mediapipe:android": [ + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gpu_buffer", + ], + "//mediapipe:ios": [ + "//mediapipe/gpu:MPPMetalUtil", + "//mediapipe/gpu:MPPMetalHelper", + "//mediapipe/objc:mediapipe_framework_ios", + ], + "//mediapipe:macos": [], + "//conditions:default": [ + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gl_simple_shaders", + "//mediapipe/gpu:shader_util", + "//mediapipe/gpu:gpu_buffer", + ], + }), +) + +cc_test( + name = "tensor_converter_calculator_test", + srcs = ["tensor_converter_calculator_test.cc"], + deps = [ + ":tensor_converter_calculator", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:calculator_runner", + "//mediapipe/framework/formats:image_format_cc_proto", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/formats:image_frame_opencv", + "//mediapipe/framework/formats:matrix", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:gtest_main", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:parse_text_proto", + "//mediapipe/framework/tool:validate_type", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + +mediapipe_proto_library( + name = "tensors_to_detections_calculator_proto", + srcs = ["tensors_to_detections_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +cc_library( + name = "tensors_to_detections_calculator", + srcs = ["tensors_to_detections_calculator.cc"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + features = ["-layering_check"], # allow depending on tensors_to_detections_calculator_gpu_deps + linkopts = select({ + "//mediapipe:apple": [ + "-framework CoreVideo", + "-framework MetalKit", + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":tensors_to_detections_calculator_cc_proto", + "//mediapipe/framework/formats:detection_cc_proto", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:port", + "//mediapipe/framework/deps:file_path", + "//mediapipe/framework/formats:location", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/formats/object_detection:anchor_cc_proto", + "//mediapipe/framework/port:ret_check", + ] + select({ + ":compute_shader_unavailable": [], + 
"//conditions:default": [":tensors_to_detections_calculator_gpu_deps"], + }), + alwayslink = 1, +) + +cc_library( + name = "tensors_to_detections_calculator_gpu_deps", + deps = select({ + "//mediapipe:ios": [ + "//mediapipe/gpu:MPPMetalUtil", + "//mediapipe/gpu:MPPMetalHelper", + ], + "//mediapipe:macos": [], + "//conditions:default": [ + "//mediapipe/gpu:gl_calculator_helper", + ], + }), +) + +mediapipe_proto_library( + name = "tensors_to_landmarks_calculator_proto", + srcs = ["tensors_to_landmarks_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +cc_library( + name = "tensors_to_landmarks_calculator", + srcs = ["tensors_to_landmarks_calculator.cc"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":tensors_to_landmarks_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:landmark_cc_proto", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:ret_check", + ], + alwayslink = 1, +) + +cc_library( + name = "tensors_to_floats_calculator", + srcs = ["tensors_to_floats_calculator.cc"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:ret_check", + ], + alwayslink = 1, +) + +cc_library( + name = "tensors_to_classification_calculator", + srcs = ["tensors_to_classification_calculator.cc"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":tensors_to_classification_calculator_cc_proto", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "//mediapipe/framework/formats:classification_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:location", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/formats:tensor", + "//mediapipe/util:resource_util", + ] + select({ + "//mediapipe:android": [ + "//mediapipe/util/android/file/base", + ], + "//mediapipe:ios": [ + "//mediapipe/util/android/file/base", + ], + "//conditions:default": [ + "//mediapipe/framework/port:file_helpers", + ], + }), + alwayslink = 1, +) + +mediapipe_proto_library( + name = "tensors_to_classification_calculator_proto", + srcs = ["tensors_to_classification_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +cc_test( + name = "tensors_to_classification_calculator_test", + srcs = ["tensors_to_classification_calculator_test.cc"], + data = ["testdata/labelmap.txt"], + deps = [ + ":tensors_to_classification_calculator", + ":tensors_to_classification_calculator_cc_proto", + "//mediapipe/framework:calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:calculator_runner", + "//mediapipe/framework/formats:classification_cc_proto", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:gtest_main", + 
"//mediapipe/framework/port:parse_text_proto", + "@com_google_absl//absl/memory", + "@com_google_googletest//:gtest_main", + ], +) + +cc_library( + name = "image_to_tensor_calculator", + srcs = ["image_to_tensor_calculator.cc"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + features = ["-layering_check"], # allow depending on image_to_tensor_calculator_gpu_deps + visibility = ["//visibility:public"], + deps = [ + ":image_to_tensor_calculator_cc_proto", + ":image_to_tensor_converter", + ":image_to_tensor_converter_opencv", + ":image_to_tensor_utils", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/framework/port:statusor", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:port", + ] + select({ + "//mediapipe/gpu:disable_gpu": [], + "//conditions:default": [":image_to_tensor_calculator_gpu_deps"], + }), + alwayslink = 1, +) + +cc_library( + name = "image_to_tensor_calculator_gpu_deps", + deps = select({ + "//mediapipe:android": [ + ":image_to_tensor_converter_gl_buffer", + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gpu_buffer", + ], + "//mediapipe:apple": [ + ":image_to_tensor_converter_metal", + "//mediapipe/gpu:MPPMetalHelper", + "//mediapipe/gpu:gpu_buffer", + ], + "//conditions:default": [ + ":image_to_tensor_converter_gl_buffer", + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gpu_buffer", + ], + }), +) + +mediapipe_proto_library( + name = "image_to_tensor_calculator_proto", + srcs = ["image_to_tensor_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +cc_test( + name = "image_to_tensor_calculator_test", + srcs = ["image_to_tensor_calculator_test.cc"], + data = [ + "testdata/image_to_tensor/input.jpg", + "testdata/image_to_tensor/large_sub_rect.png", + "testdata/image_to_tensor/large_sub_rect_keep_aspect.png", + "testdata/image_to_tensor/large_sub_rect_keep_aspect_with_rotation.png", + "testdata/image_to_tensor/medium_sub_rect_keep_aspect.png", + "testdata/image_to_tensor/medium_sub_rect_keep_aspect_with_rotation.png", + "testdata/image_to_tensor/medium_sub_rect_with_rotation.png", + "testdata/image_to_tensor/noop_except_range.png", + ], + deps = [ + ":image_to_tensor_calculator", + ":image_to_tensor_utils", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:calculator_runner", + "//mediapipe/framework/deps:file_path", + "//mediapipe/framework/formats:image_format_cc_proto", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/formats:image_frame_opencv", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:gtest_main", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgcodecs", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:parse_text_proto", + "//mediapipe/framework/tool:validate_type", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@org_tensorflow//tensorflow/lite:framework", + ], +) + +cc_library( + name = "image_to_tensor_converter", + hdrs = ["image_to_tensor_converter.h"], + 
deps = [ + ":image_to_tensor_utils", + "//mediapipe/framework:packet", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:statusor", + ], +) + +cc_library( + name = "image_to_tensor_converter_opencv", + srcs = ["image_to_tensor_converter_opencv.cc"], + hdrs = ["image_to_tensor_converter_opencv.h"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + deps = [ + ":image_to_tensor_converter", + ":image_to_tensor_utils", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:image_format_cc_proto", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/formats:image_frame_opencv", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:status", + "//mediapipe/framework/port:statusor", + ], +) + +cc_library( + name = "image_to_tensor_converter_gl_buffer", + srcs = ["image_to_tensor_converter_gl_buffer.cc"], + hdrs = ["image_to_tensor_converter_gl_buffer.h"], + deps = ["//mediapipe/framework:port"] + select({ + "//mediapipe:apple": [], + "//conditions:default": [ + ":image_to_tensor_converter", + ":image_to_tensor_utils", + "@com_google_absl//absl/strings", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/framework/port:statusor", + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gpu_buffer", + "//mediapipe/gpu:gpu_buffer_format", + "@org_tensorflow//tensorflow/lite/delegates/gpu/common:shape", + "@org_tensorflow//tensorflow/lite/delegates/gpu/common:types", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:command_queue", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_buffer", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_call", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_texture", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:request_gpu_info", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:variable", + "@org_tensorflow//tensorflow/lite/delegates/gpu/gl/converters:util", + ], + }), +) + +cc_library( + name = "image_to_tensor_converter_gl_texture", + srcs = ["image_to_tensor_converter_gl_texture.cc"], + hdrs = ["image_to_tensor_converter_gl_texture.h"], + deps = ["//mediapipe/framework:port"] + select({ + "//mediapipe/gpu:disable_gpu": [], + "//conditions:default": [ + ":image_to_tensor_converter", + ":image_to_tensor_utils", + "@com_google_absl//absl/strings", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/framework/port:statusor", + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gl_simple_shaders", + "//mediapipe/gpu:gpu_buffer", + "//mediapipe/gpu:shader_util", + ], + }), +) + +cc_library( + name = "image_to_tensor_converter_metal", + srcs = ["image_to_tensor_converter_metal.cc"], + hdrs = ["image_to_tensor_converter_metal.h"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + linkopts = select({ + "//mediapipe:apple": [ + "-framework CoreVideo", + "-framework MetalKit", + ], + "//conditions:default": [], + }), + deps = 
["//mediapipe/framework:port"] + select({ + "//mediapipe:apple": [ + ":image_to_tensor_converter", + ":image_to_tensor_utils", + "//mediapipe/gpu:MPPMetalHelper", + "@com_google_absl//absl/strings", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/formats:tensor", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/framework/port:statusor", + "//mediapipe/gpu:gpu_buffer", + "//mediapipe/gpu:gpu_buffer_format", + "@org_tensorflow//tensorflow/lite/delegates/gpu/common:shape", + "@org_tensorflow//tensorflow/lite/delegates/gpu/common:types", + ], + "//conditions:default": [], + }), +) + +cc_library( + name = "image_to_tensor_utils", + srcs = ["image_to_tensor_utils.cc"], + hdrs = ["image_to_tensor_utils.h"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:statusor", + "@com_google_absl//absl/types:optional", + ], +) + +cc_test( + name = "image_to_tensor_utils_test", + srcs = ["image_to_tensor_utils_test.cc"], + deps = [ + ":image_to_tensor_utils", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/port:gtest_main", + ], +) diff --git a/mediapipe/calculators/tensor/image_to_tensor_calculator.cc b/mediapipe/calculators/tensor/image_to_tensor_calculator.cc new file mode 100644 index 0000000000..9f8c2b0235 --- /dev/null +++ b/mediapipe/calculators/tensor/image_to_tensor_calculator.cc @@ -0,0 +1,275 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <array> +#include <vector> + +#include "mediapipe/calculators/tensor/image_to_tensor_calculator.pb.h" +#include "mediapipe/calculators/tensor/image_to_tensor_converter.h" +#include "mediapipe/calculators/tensor/image_to_tensor_converter_opencv.h" +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port.h" +#include "mediapipe/framework/port/canonical_errors.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" + +#if !MEDIAPIPE_DISABLE_GPU +#include "mediapipe/gpu/gpu_buffer.h" + +#if MEDIAPIPE_METAL_ENABLED +#include "mediapipe/calculators/tensor/image_to_tensor_converter_metal.h" +#include "mediapipe/gpu/MPPMetalHelper.h" +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 +#include "mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.h" +#include "mediapipe/gpu/gl_calculator_helper.h" +#else +#include "mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.h" +#include "mediapipe/gpu/gl_calculator_helper.h" +#endif  // MEDIAPIPE_METAL_ENABLED + +#endif  // !MEDIAPIPE_DISABLE_GPU + +namespace { +constexpr char kInputCpu[] = "IMAGE"; +constexpr char kInputGpu[] = "IMAGE_GPU"; +constexpr char kOutputMatrix[] = "MATRIX"; +constexpr char kOutput[] = "TENSORS"; +constexpr char kInputNormRect[] = "NORM_RECT"; +constexpr char kOutputLetterboxPadding[] = "LETTERBOX_PADDING"; +}  // namespace + +namespace mediapipe { + +// Converts image into Tensor, possibly with cropping, resizing and +// normalization, according to specified inputs and options. +// +// Inputs: +// IMAGE - ImageFrame [ImageFormat::SRGB/SRGBA] +// Image to extract from. +// IMAGE_GPU - GpuBuffer [GpuBufferFormat::kBGRA32] +// Image to extract from. +// (Either IMAGE or IMAGE_GPU has to be specified.) +// +// NORM_RECT - NormalizedRect @Optional +// Describes region of image to extract. +// @Optional: rect covering the whole image is used if not specified. +// +// Outputs: +// TENSORS - std::vector<Tensor> +// Vector containing a single Tensor populated with an extracted RGB image. +// MATRIX - std::array<float, 16> @Optional +// An std::array<float, 16> representing a 4x4 row-major-order matrix which +// can be used to map a point on the output tensor to a point on the input +// image. +// LETTERBOX_PADDING - std::array<float, 4> @Optional +// An std::array<float, 4> representing the letterbox padding from the 4 +// sides ([left, top, right, bottom]) of the output image, normalized to +// [0.f, 1.f] by the output dimensions. The padding values are non-zero only +// when the "keep_aspect_ratio" is true. +// +// For instance, when the input image is 10x10 (width x height) and the +// output dimensions specified in the calculator option are 20x40 and +// "keep_aspect_ratio" is true, the calculator scales the input image to +// 20x20 and places it in the middle of the output image with an equal +// padding of 10 pixels at the top and the bottom. The resulting array is +// therefore [0.f, 0.25f, 0.f, 0.25f] (10/40 = 0.25f).
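To make the letterbox example above concrete, here is a small stand-alone sketch of the padding arithmetic; it is my own illustration of the documented behavior, not the calculator's PadRoi implementation:

#include <algorithm>
#include <array>

// Returns [left, top, right, bottom] letterbox padding, normalized by the
// output dimensions, when the input is scaled to fit the output while keeping
// its aspect ratio.
std::array<float, 4> LetterboxPadding(int in_w, int in_h, int out_w, int out_h) {
  const float scale = std::min(static_cast<float>(out_w) / in_w,
                               static_cast<float>(out_h) / in_h);
  const float pad_x = (out_w - in_w * scale) / 2.0f / out_w;
  const float pad_y = (out_h - in_h * scale) / 2.0f / out_h;
  return {pad_x, pad_y, pad_x, pad_y};
}

// LetterboxPadding(10, 10, 20, 40) yields {0.f, 0.25f, 0.f, 0.25f}, matching
// the 10x10 -> 20x40 example above.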
+// +// Example: +// node { +// calculator: "ImageToTensorCalculator" +// input_stream: "IMAGE:image" # or "IMAGE_GPU:image" +// input_stream: "NORM_RECT:roi" +// output_stream: "TENSORS:tensors" +// output_stream: "MATRIX:matrix" +// options { +// [mediapipe.ImageToTensorCalculatorOptions.ext] { +// output_tensor_width: 256 +// output_tensor_height: 256 +// keep_aspect_ratio: false +// output_tensor_float_range { +// min: 0.0 +// max: 1.0 +// } +// # gpu_origin: CONVENTIONAL # or TOP_LEFT +// } +// } +// } +class ImageToTensorCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc) { + const auto& options = + cc->Options(); + + RET_CHECK(options.has_output_tensor_float_range()) + << "Output tensor range is required."; + RET_CHECK_LT(options.output_tensor_float_range().min(), + options.output_tensor_float_range().max()) + << "Valid output tensor range is required."; + RET_CHECK_GT(options.output_tensor_width(), 0) + << "Valid output tensor width is required."; + RET_CHECK_GT(options.output_tensor_height(), 0) + << "Valid output tensor height is required."; + + if (cc->Inputs().HasTag(kInputNormRect)) { + cc->Inputs().Tag(kInputNormRect).Set(); + } + if (cc->Outputs().HasTag(kOutputLetterboxPadding)) { + cc->Outputs().Tag(kOutputLetterboxPadding).Set>(); + } + if (cc->Outputs().HasTag(kOutputMatrix)) { + cc->Outputs().Tag(kOutputMatrix).Set>(); + } + + const bool has_cpu_input = cc->Inputs().HasTag(kInputCpu); + const bool has_gpu_input = cc->Inputs().HasTag(kInputGpu); + RET_CHECK_EQ((has_cpu_input ? 1 : 0) + (has_gpu_input ? 1 : 0), 1) + << "Either CPU or GPU input is expected, not both."; + + if (has_cpu_input) { + cc->Inputs().Tag(kInputCpu).Set(); + } else if (has_gpu_input) { +#if MEDIAPIPE_DISABLE_GPU + return mediapipe::UnimplementedError("GPU processing is disabled"); +#else + +#if MEDIAPIPE_METAL_ENABLED + MP_RETURN_IF_ERROR([MPPMetalHelper updateContract:cc]); +#else + MP_RETURN_IF_ERROR(mediapipe::GlCalculatorHelper::UpdateContract(cc)); +#endif // MEDIAPIPE_METAL_ENABLED + cc->Inputs().Tag(kInputGpu).Set(); + +#endif // MEDIAPIPE_DISABLE_GPU + } + cc->Outputs().Tag(kOutput).Set>(); + return ::mediapipe::OkStatus(); + } + + ::mediapipe::Status Open(CalculatorContext* cc) { + // Makes sure outputs' next timestamp bound update is handled automatically + // by the framework. + cc->SetOffset(TimestampDiff(0)); + options_ = cc->Options(); + output_width_ = options_.output_tensor_width(); + output_height_ = options_.output_tensor_height(); + range_min_ = options_.output_tensor_float_range().min(); + range_max_ = options_.output_tensor_float_range().max(); + + if (cc->Inputs().HasTag(kInputCpu)) { + ASSIGN_OR_RETURN(converter_, CreateOpenCvConverter(cc)); + } else { +#if MEDIAPIPE_DISABLE_GPU + return mediapipe::UnimplementedError("GPU processing is disabled"); +#else + +#if MEDIAPIPE_METAL_ENABLED + ASSIGN_OR_RETURN(converter_, CreateMetalConverter(cc)); +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + ASSIGN_OR_RETURN(converter_, CreateImageToGlBufferTensorConverter( + cc, DoesInputStartAtBottom())); +#else + ASSIGN_OR_RETURN(converter_, CreateImageToGlTextureTensorConverter( + cc, DoesInputStartAtBottom())); +#endif // MEDIAPIPE_METAL_ENABLED + +#endif // MEDIAPIPE_DISABLE_GPU + } + return ::mediapipe::OkStatus(); + } + + ::mediapipe::Status Process(CalculatorContext* cc) { + const InputStreamShard& input = cc->Inputs().Tag( + cc->Inputs().HasTag(kInputCpu) ? 
kInputCpu : kInputGpu); + if (input.IsEmpty()) { + // Timestamp bound update happens automatically. (See Open().) + return ::mediapipe::OkStatus(); + } + + absl::optional norm_rect; + if (cc->Inputs().HasTag(kInputNormRect)) { + if (cc->Inputs().Tag(kInputNormRect).IsEmpty()) { + // Timestamp bound update happens automatically. (See Open().) + return ::mediapipe::OkStatus(); + } + norm_rect = + cc->Inputs().Tag(kInputNormRect).Get(); + if (norm_rect->width() == 0 && norm_rect->height() == 0) { + // WORKAROUND: some existing graphs may use sentinel rects {width=0, + // height=0, ...} quite often and calculator has to handle them + // gracefully by updating timestamp bound instead of returning failure. + // Timestamp bound update happens automatically. (See Open().) + // NOTE: usage of sentinel rects should be avoided. + DLOG(WARNING) + << "Updating timestamp bound in response to a sentinel rect"; + return ::mediapipe::OkStatus(); + } + } + + const Packet& image_packet = input.Value(); + const Size& size = converter_->GetImageSize(image_packet); + RotatedRect roi = GetRoi(size.width, size.height, norm_rect); + ASSIGN_OR_RETURN(auto padding, PadRoi(options_.output_tensor_width(), + options_.output_tensor_height(), + options_.keep_aspect_ratio(), &roi)); + if (cc->Outputs().HasTag(kOutputLetterboxPadding)) { + cc->Outputs() + .Tag(kOutputLetterboxPadding) + .AddPacket(MakePacket>(padding).At( + cc->InputTimestamp())); + } + if (cc->Outputs().HasTag(kOutputMatrix)) { + std::array matrix; + GetRotatedSubRectToRectTransformMatrix(roi, size.width, size.height, + /*flip_horizontaly=*/false, + &matrix); + cc->Outputs() + .Tag(kOutputMatrix) + .AddPacket(MakePacket>(std::move(matrix)) + .At(cc->InputTimestamp())); + } + + ASSIGN_OR_RETURN( + Tensor tensor, + converter_->Convert(image_packet, roi, {output_width_, output_height_}, + range_min_, range_max_)); + + std::vector result; + result.push_back(std::move(tensor)); + cc->Outputs().Tag(kOutput).AddPacket( + MakePacket>(std::move(result)) + .At(cc->InputTimestamp())); + + return ::mediapipe::OkStatus(); + } + + private: + bool DoesInputStartAtBottom() { + return options_.gpu_origin() != mediapipe::GpuOrigin_Mode_TOP_LEFT; + } + + std::unique_ptr converter_; + mediapipe::ImageToTensorCalculatorOptions options_; + int output_width_ = 0; + int output_height_ = 0; + float range_min_ = 0.0f; + float range_max_ = 1.0f; +}; + +REGISTER_CALCULATOR(ImageToTensorCalculator); + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/image_to_tensor_calculator.proto b/mediapipe/calculators/tensor/image_to_tensor_calculator.proto new file mode 100644 index 0000000000..038952a01e --- /dev/null +++ b/mediapipe/calculators/tensor/image_to_tensor_calculator.proto @@ -0,0 +1,64 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
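The MATRIX output assembled in Process() above is a row-major 4x4 transform from normalized output-tensor coordinates to normalized input-image coordinates. A hypothetical consumer-side helper (not part of this patch) might apply it like this:

#include <array>
#include <utility>

// Maps a normalized point (x, y in [0, 1]) on the output tensor back onto the
// input image, using rows 0 and 1 of the row-major 4x4 MATRIX packet; this is
// the same "transform_matrix * tc" math the GL shader further down performs.
std::pair<float, float> TensorToImagePoint(const std::array<float, 16>& m,
                                           float x, float y) {
  // Homogeneous point (x, y, 0, 1).
  return {m[0] * x + m[1] * y + m[3], m[4] * x + m[5] * y + m[7]};
}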
+ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message GpuOrigin { + enum Mode { + DEFAULT = 0; + + // OpenGL: bottom-left origin + // Metal : top-left origin + CONVENTIONAL = 1; + + // OpenGL: top-left origin + // Metal : top-left origin + TOP_LEFT = 2; + } +} + +message ImageToTensorCalculatorOptions { + extend mediapipe.CalculatorOptions { + optional ImageToTensorCalculatorOptions ext = 334361939; + } + + // Range of float values [min, max]. + // min, must be strictly less than max. + message FloatRange { + optional float min = 1; + optional float max = 2; + } + + optional int32 output_tensor_width = 1; + optional int32 output_tensor_height = 2; + + // If true, image region will be extracted and copied into tensor keeping + // region aspect ratio, which usually results in letterbox padding. Otherwise, + // if false, image region is stretched to fill output tensor fully. + optional bool keep_aspect_ratio = 3; + + // Output tensor element range/type image pixels are converted to. + oneof range { + FloatRange output_tensor_float_range = 4; + } + + // For CONVENTIONAL mode for OpenGL, input image starts at bottom and needs + // to be flipped vertically as tensors are expected to start at top. + // (DEFAULT or unset interpreted as CONVENTIONAL.) + optional GpuOrigin.Mode gpu_origin = 5; +} diff --git a/mediapipe/calculators/tensor/image_to_tensor_calculator_test.cc b/mediapipe/calculators/tensor/image_to_tensor_calculator_test.cc new file mode 100644 index 0000000000..6c5162a956 --- /dev/null +++ b/mediapipe/calculators/tensor/image_to_tensor_calculator_test.cc @@ -0,0 +1,262 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
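The output_tensor_float_range option above implies a simple linear remapping of 8-bit pixel values into [min, max]. A minimal sketch of that arithmetic (an illustrative helper, not the GetValueRangeTransformation utility used elsewhere in this patch):

// An 8-bit pixel value p in [0, 255] becomes p * scale + offset.
struct ValueTransform {
  float scale;
  float offset;
};

ValueTransform PixelToFloatRange(float range_min, float range_max) {
  return {(range_max - range_min) / 255.0f, range_min};
}

// For { min: 0.0 max: 1.0 } this is p / 255; for { min: -1.0 max: 1.0 } it is
// p * 2 / 255 - 1.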
+ +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/substitute.h" +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/calculator_runner.h" +#include "mediapipe/framework/deps/file_path.h" +#include "mediapipe/framework/formats/image_format.pb.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/formats/image_frame_opencv.h" +#include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgcodecs_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/parse_text_proto.h" +#include "mediapipe/framework/port/status_matchers.h" + +namespace mediapipe { +namespace { + +cv::Mat GetRgb(absl::string_view path) { + cv::Mat bgr = cv::imread(file::JoinPath("./", path)); + cv::Mat rgb; + cv::cvtColor(bgr, rgb, cv::COLOR_BGR2RGB); + return rgb; +} + +cv::Mat GetRgba(absl::string_view path) { + cv::Mat bgr = cv::imread(file::JoinPath("./", path)); + cv::Mat rgb; + cv::cvtColor(bgr, rgb, cv::COLOR_BGR2RGBA); + return rgb; +} + +// Image to tensor test template. +// No processing/assertions should be done after the function is invoked. +void RunTest(cv::Mat input, cv::Mat expected_result, float range_min, + float range_max, int tensor_width, int tensor_height, + bool keep_aspect, const mediapipe::NormalizedRect& roi) { + auto graph_config = mediapipe::ParseTextProtoOrDie( + absl::Substitute(R"( + input_stream: "input_image" + input_stream: "roi" + node { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:input_image" + input_stream: "NORM_RECT:roi" + output_stream: "TENSORS:tensor" + options { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: $0 + output_tensor_height: $1 + keep_aspect_ratio: $4 + output_tensor_float_range { + min: $2 + max: $3 + } + } + } + } + )", + /*$0=*/tensor_width, + /*$1=*/tensor_height, + /*$2=*/range_min, + /*$3=*/range_max, + /*$4=*/keep_aspect ? "true" : "false")); + + std::vector output_packets; + tool::AddVectorSink("tensor", &graph_config, &output_packets); + + // Run the graph. + CalculatorGraph graph; + MP_ASSERT_OK(graph.Initialize(graph_config)); + MP_ASSERT_OK(graph.StartRun({})); + + ImageFrame input_image( + input.channels() == 4 ? ImageFormat::SRGBA : ImageFormat::SRGB, + input.cols, input.rows, input.step, input.data, [](uint8*) {}); + MP_ASSERT_OK(graph.AddPacketToInputStream( + "input_image", + MakePacket(std::move(input_image)).At(Timestamp(0)))); + MP_ASSERT_OK(graph.AddPacketToInputStream( + "roi", + MakePacket(std::move(roi)).At(Timestamp(0)))); + + MP_ASSERT_OK(graph.WaitUntilIdle()); + ASSERT_THAT(output_packets, testing::SizeIs(1)); + + // Get and process results. 
+ const std::vector& tensor_vec = + output_packets[0].Get>(); + ASSERT_THAT(tensor_vec, testing::SizeIs(1)); + + const Tensor& tensor = tensor_vec[0]; + EXPECT_EQ(tensor.element_type(), Tensor::ElementType::kFloat32); + + auto view = tensor.GetCpuReadView(); + cv::Mat tensor_mat(tensor_height, tensor_width, CV_32FC3, + const_cast(view.buffer())); + cv::Mat result_rgb; + auto transformation = + GetValueRangeTransformation(range_min, range_max, 0.0f, 255.0f) + .ValueOrDie(); + tensor_mat.convertTo(result_rgb, CV_8UC3, transformation.scale, + transformation.offset); + + cv::Mat diff; + cv::absdiff(result_rgb, expected_result, diff); + double max_val; + cv::minMaxLoc(diff, nullptr, &max_val); + // Expects the maximum absolute pixel-by-pixel difference is less than 5. + EXPECT_LE(max_val, 5); + + // Fully close graph at end, otherwise calculator+tensors are destroyed + // after calling WaitUntilDone(). + MP_ASSERT_OK(graph.CloseInputStream("input_image")); + MP_ASSERT_OK(graph.CloseInputStream("roi")); + MP_ASSERT_OK(graph.WaitUntilDone()); +} + +TEST(ImageToTensorCalculatorTest, MediumSubRectKeepAspect) { + mediapipe::NormalizedRect roi; + roi.set_x_center(0.65f); + roi.set_y_center(0.4f); + roi.set_width(0.5f); + roi.set_height(0.5f); + roi.set_rotation(0); + RunTest( + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/input.jpg"), + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/medium_sub_rect_keep_aspect.png"), + /*range_min=*/0.0f, + /*range_max=*/1.0f, + /*tensor_width=*/256, /*tensor_height=*/256, /*keep_aspect=*/true, roi); +} + +TEST(ImageToTensorCalculatorTest, MediumSubRectKeepAspectWithRotation) { + mediapipe::NormalizedRect roi; + roi.set_x_center(0.65f); + roi.set_y_center(0.4f); + roi.set_width(0.5f); + roi.set_height(0.5f); + roi.set_rotation(M_PI * 90.0f / 180.0f); + RunTest(GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/input.jpg"), + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/" + "medium_sub_rect_keep_aspect_with_rotation.png"), + /*range_min=*/0.0f, /*range_max=*/1.0f, + /*tensor_width=*/256, /*tensor_height=*/256, /*keep_aspect=*/true, + roi); +} + +TEST(ImageToTensorCalculatorTest, MediumSubRectWithRotation) { + mediapipe::NormalizedRect roi; + roi.set_x_center(0.65f); + roi.set_y_center(0.4f); + roi.set_width(0.5f); + roi.set_height(0.5f); + roi.set_rotation(M_PI * -45.0f / 180.0f); + RunTest( + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/input.jpg"), + GetRgb( + "/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/medium_sub_rect_with_rotation.png"), + /*range_min=*/-1.0f, + /*range_max=*/1.0f, + /*tensor_width=*/256, /*tensor_height=*/256, /*keep_aspect=*/false, roi); +} + +TEST(ImageToTensorCalculatorTest, LargeSubRect) { + mediapipe::NormalizedRect roi; + roi.set_x_center(0.5f); + roi.set_y_center(0.5f); + roi.set_width(1.5f); + roi.set_height(1.1f); + roi.set_rotation(0); + RunTest(GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/input.jpg"), + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/large_sub_rect.png"), + /*range_min=*/0.0f, + /*range_max=*/1.0f, + /*tensor_width=*/128, /*tensor_height=*/128, /*keep_aspect=*/false, + roi); +} + +TEST(ImageToTensorCalculatorTest, LargeSubRectKeepAspect) { + mediapipe::NormalizedRect roi; + roi.set_x_center(0.5f); + roi.set_y_center(0.5f); + roi.set_width(1.5f); + roi.set_height(1.1f); + roi.set_rotation(0); + RunTest( + GetRgb("/mediapipe/calculators/" + 
"tensor/testdata/image_to_tensor/input.jpg"), + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/large_sub_rect_keep_aspect.png"), + /*range_min=*/0.0f, + /*range_max=*/1.0f, + /*tensor_width=*/128, /*tensor_height=*/128, /*keep_aspect=*/true, roi); +} + +TEST(ImageToTensorCalculatorTest, LargeSubRectKeepAspectWithRotation) { + mediapipe::NormalizedRect roi; + roi.set_x_center(0.5f); + roi.set_y_center(0.5f); + roi.set_width(1.5f); + roi.set_height(1.1f); + roi.set_rotation(M_PI * -15.0f / 180.0f); + RunTest(GetRgba("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/input.jpg"), + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/" + "large_sub_rect_keep_aspect_with_rotation.png"), + /*range_min=*/0.0f, + /*range_max=*/1.0f, + /*tensor_width=*/128, /*tensor_height=*/128, /*keep_aspect=*/true, + roi); +} + +TEST(ImageToTensorCalculatorTest, NoOpExceptRange) { + mediapipe::NormalizedRect roi; + roi.set_x_center(0.5f); + roi.set_y_center(0.5f); + roi.set_width(1.0f); + roi.set_height(1.0f); + roi.set_rotation(0); + RunTest(GetRgba("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/input.jpg"), + GetRgb("/mediapipe/calculators/" + "tensor/testdata/image_to_tensor/noop_except_range.png"), + /*range_min=*/0.0f, + /*range_max=*/1.0f, + /*tensor_width=*/64, /*tensor_height=*/128, /*keep_aspect=*/true, + roi); +} + +} // namespace +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter.h b/mediapipe/calculators/tensor/image_to_tensor_converter.h new file mode 100644 index 0000000000..062195697b --- /dev/null +++ b/mediapipe/calculators/tensor/image_to_tensor_converter.h @@ -0,0 +1,53 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_H_ +#define MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_H_ + +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/packet.h" +#include "mediapipe/framework/port/statusor.h" + +namespace mediapipe { + +struct Size { + int width; + int height; +}; + +// Converts image to tensor. +class ImageToTensorConverter { + public: + virtual ~ImageToTensorConverter() = default; + + virtual Size GetImageSize(const Packet& image_packet) = 0; + + // Converts image to tensor. + // @image_packet contains image to extract from. + // @roi describes region of interest within the image to extract (absolute + // values). + // @output_dims dimensions of output tensor. + // @range_min/max describes output tensor range image pixels should converted + // to. 
+ virtual ::mediapipe::StatusOr Convert(const Packet& image_packet, + const RotatedRect& roi, + const Size& output_dims, + float range_min, + float range_max) = 0; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_H_ diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.cc b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.cc new file mode 100644 index 0000000000..fb1b9ab896 --- /dev/null +++ b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.cc @@ -0,0 +1,340 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.h" + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "mediapipe/calculators/tensor/image_to_tensor_converter.h" +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/canonical_errors.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/framework/port/statusor.h" +#include "mediapipe/gpu/gl_calculator_helper.h" +#include "mediapipe/gpu/gpu_buffer.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" +#include "tensorflow/lite/delegates/gpu/gl/command_queue.h" +#include "tensorflow/lite/delegates/gpu/gl/converters/util.h" +#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h" +#include "tensorflow/lite/delegates/gpu/gl/gl_call.h" +#include "tensorflow/lite/delegates/gpu/gl/gl_texture.h" +#include "tensorflow/lite/delegates/gpu/gl/request_gpu_info.h" +#include "tensorflow/lite/delegates/gpu/gl/variable.h" + +namespace mediapipe { + +namespace { + +// Implements a common pattern of extracting a subrect from RGBA input texture +// and resizing it into a buffer. +class SubRectExtractorGl { + public: + // Extracts a region defined by @sub_rect, removes A channel, transforms input + // pixels as alpha * x + beta and resizes result into destination. 
+ ::mediapipe::Status ExtractSubRectToBuffer( + const tflite::gpu::gl::GlTexture& texture, + const tflite::gpu::HW& texture_size, const RotatedRect& sub_rect, + bool flip_horizontaly, float alpha, float beta, + const tflite::gpu::HW& destination_size, + tflite::gpu::gl::CommandQueue* command_queue, + tflite::gpu::gl::GlBuffer* destination); + + static ::mediapipe::StatusOr Create( + bool input_starts_at_bottom); + + private: + explicit SubRectExtractorGl(tflite::gpu::gl::GlProgram program, + tflite::gpu::uint3 workgroup_size) + : program_(std::move(program)), workgroup_size_(workgroup_size) {} + + tflite::gpu::gl::GlProgram program_; + tflite::gpu::uint3 workgroup_size_; +}; + +::mediapipe::Status SetMat4x4(const tflite::gpu::gl::GlProgram& program, + const std::string& name, float* data) { + GLint uniform_id; + MP_RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glGetUniformLocation, &uniform_id, + program.id(), name.c_str())); + return TFLITE_GPU_CALL_GL(glProgramUniformMatrix4fv, program.id(), uniform_id, + 1, GL_TRUE, data); +} + +class GlParametersOverride { + public: + static ::mediapipe::StatusOr Create( + const std::vector>& overrides) { + std::vector old_values(overrides.size()); + for (int i = 0; i < overrides.size(); ++i) { + MP_RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glGetTexParameteriv, GL_TEXTURE_2D, + overrides[i].first, + &old_values[i])); + if (overrides[i].second != old_values[i]) { + MP_RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glTexParameteri, GL_TEXTURE_2D, + overrides[i].first, + overrides[i].second)); + } + } + return GlParametersOverride(overrides, std::move(old_values)); + } + + ::mediapipe::Status Revert() { + for (int i = 0; i < overrides_.size(); ++i) { + if (overrides_[i].second != old_values_[i]) { + MP_RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glTexParameteri, GL_TEXTURE_2D, + overrides_[i].first, + old_values_[i])); + } + } + return ::mediapipe::OkStatus(); + } + + private: + GlParametersOverride(const std::vector>& overrides, + std::vector old_values) + : overrides_(overrides), old_values_(std::move(old_values)) {} + + std::vector> overrides_; + std::vector old_values_; +}; + +constexpr char kShaderCode[] = R"( +layout(std430) buffer; + +precision highp float; + +// It is possible to use "vec3 elements[];" here, however due to alignment +// requirements it works only when "packed" layout is used. "packed" layout is +// determined by implementation and it's expected that OpenGL API is used to +// query the layout. Favoring float array over vec3, considering performance is +// comparable, layout is the same and no need for layout querying (even though +// it's not quite needed here as there's only one member). +layout(binding = 0) writeonly buffer B0 { + float elements[]; +} output_data; + +uniform ivec2 out_size; +uniform float alpha; +uniform float beta; +uniform mat4 transform_matrix; +uniform mediump sampler2D input_data; + +void main() { + int out_width = out_size.x; + int out_height = out_size.y; + + ivec2 gid = ivec2(gl_GlobalInvocationID.xy); + if (gid.x >= out_width || gid.y >= out_height) { + return; + } + + // transform from image.width, image.height range to [0, 1] + float normal_x = (float(gid.x) + 0.5f) / float(out_width); + float normal_y = (float(gid.y) + 0.5f) / float(out_height); + vec4 tc = vec4(normal_x, normal_y, 0.0, 1.0); + + // Apply transformation from roi coordinates to original image coordinates. + tc = transform_matrix * tc; +#ifdef INPUT_STARTS_AT_BOTTOM + // Opengl texture sampler has origin in lower left corner, + // so we invert y coordinate. 
+ tc.y = 1.0f - tc.y; +#endif // INPUT_STARTS_AT_BOTTOM + vec4 src_value = alpha * texture(input_data, tc.xy) + beta; + + int linear_index = gid.y * out_width + gid.x; + + // output_data.elements is populated as though it contains vec3 elements. + int first_component_index = 3 * linear_index; + output_data.elements[first_component_index] = src_value.r; + output_data.elements[first_component_index + 1] = src_value.g; + output_data.elements[first_component_index + 2] = src_value.b; +} +)"; + +::mediapipe::Status SubRectExtractorGl::ExtractSubRectToBuffer( + const tflite::gpu::gl::GlTexture& texture, + const tflite::gpu::HW& texture_size, const RotatedRect& texture_sub_rect, + bool flip_horizontaly, float alpha, float beta, + const tflite::gpu::HW& destination_size, + tflite::gpu::gl::CommandQueue* command_queue, + tflite::gpu::gl::GlBuffer* destination) { + std::array transform_mat; + GetRotatedSubRectToRectTransformMatrix(texture_sub_rect, texture_size.w, + texture_size.h, flip_horizontaly, + &transform_mat); + MP_RETURN_IF_ERROR(texture.BindAsSampler2D(0)); + + ASSIGN_OR_RETURN(auto overrides, GlParametersOverride::Create( + {{GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE}, + {GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE}, + {GL_TEXTURE_MIN_FILTER, GL_LINEAR}, + {GL_TEXTURE_MAG_FILTER, GL_LINEAR}})); + + MP_RETURN_IF_ERROR(destination->BindToIndex(0)); + MP_RETURN_IF_ERROR(program_.SetParameter({"input_data", 0})); + MP_RETURN_IF_ERROR( + SetMat4x4(program_, "transform_matrix", transform_mat.data())); + MP_RETURN_IF_ERROR(program_.SetParameter( + {"out_size", tflite::gpu::int2(destination_size.w, destination_size.h)})); + MP_RETURN_IF_ERROR(program_.SetParameter({"alpha", alpha})); + MP_RETURN_IF_ERROR(program_.SetParameter({"beta", beta})); + tflite::gpu::uint3 num_workgroups = tflite::gpu::DivideRoundUp( + tflite::gpu::uint3{destination_size.w, destination_size.h, 1}, + workgroup_size_); + MP_RETURN_IF_ERROR(command_queue->Dispatch(program_, num_workgroups)); + + return overrides.Revert(); +} + +::mediapipe::StatusOr SubRectExtractorGl::Create( + bool input_starts_at_bottom) { + const tflite::gpu::uint3 workgroup_size = {8, 8, 1}; + std::string starts_at_bottom_def; + if (input_starts_at_bottom) { + starts_at_bottom_def = R"( + #define INPUT_STARTS_AT_BOTTOM; + )"; + } + const std::string full_shader_source = + absl::StrCat(tflite::gpu::gl::GetShaderHeader(workgroup_size), + starts_at_bottom_def, kShaderCode); + + tflite::gpu::gl::GlShader shader; + MP_RETURN_IF_ERROR(tflite::gpu::gl::GlShader::CompileShader( + GL_COMPUTE_SHADER, full_shader_source, &shader)); + tflite::gpu::gl::GlProgram program; + MP_RETURN_IF_ERROR( + tflite::gpu::gl::GlProgram::CreateWithShader(shader, &program)); + + return SubRectExtractorGl(std::move(program), workgroup_size); +} + +class GlProcessor : public ImageToTensorConverter { + public: + ::mediapipe::Status Init(CalculatorContext* cc, bool input_starts_at_bottom) { + MP_RETURN_IF_ERROR(gl_helper_.Open(cc)); + return gl_helper_.RunInGlContext( + [this, input_starts_at_bottom]() -> ::mediapipe::Status { + tflite::gpu::GpuInfo gpu_info; + MP_RETURN_IF_ERROR(tflite::gpu::gl::RequestGpuInfo(&gpu_info)); + RET_CHECK(tflite::gpu::IsOpenGl31OrAbove(gpu_info)) + << "OpenGL ES 3.1 is required."; + command_queue_ = tflite::gpu::gl::NewCommandQueue(gpu_info); + + ASSIGN_OR_RETURN(auto extractor, + SubRectExtractorGl::Create(input_starts_at_bottom)); + extractor_ = + absl::make_unique(std::move(extractor)); + return ::mediapipe::OkStatus(); + }); + } + + Size GetImageSize(const Packet& 
image_packet) override { + const auto& image = image_packet.Get(); + return {image.width(), image.height()}; + } + + ::mediapipe::StatusOr Convert(const Packet& image_packet, + const RotatedRect& roi, + const Size& output_dims, + float range_min, + float range_max) override { + const auto& input = image_packet.Get(); + if (input.format() != mediapipe::GpuBufferFormat::kBGRA32) { + return InvalidArgumentError( + absl::StrCat("Only BGRA/RGBA textures are supported, passed format: ", + static_cast(input.format()))); + } + + constexpr int kNumChannels = 3; + Tensor tensor(Tensor::ElementType::kFloat32, + {1, output_dims.height, output_dims.width, kNumChannels}); + + MP_RETURN_IF_ERROR(gl_helper_.RunInGlContext( + [this, &tensor, &input, &roi, &output_dims, range_min, + range_max]() -> ::mediapipe::Status { + constexpr int kRgbaNumChannels = 4; + auto source_texture = gl_helper_.CreateSourceTexture(input); + tflite::gpu::gl::GlTexture input_texture( + GL_TEXTURE_2D, source_texture.name(), GL_RGBA, + source_texture.width() * source_texture.height() * + kRgbaNumChannels * sizeof(uint8_t), + /*layer=*/0, + /*owned=*/false); + + constexpr float kInputImageRangeMin = 0.0f; + constexpr float kInputImageRangeMax = 1.0f; + ASSIGN_OR_RETURN(auto transform, + GetValueRangeTransformation(kInputImageRangeMin, + kInputImageRangeMax, + range_min, range_max)); + + auto buffer_view = tensor.GetOpenGlBufferWriteView(); + tflite::gpu::gl::GlBuffer output(GL_SHADER_STORAGE_BUFFER, + buffer_view.name(), tensor.bytes(), + /*offset=*/0, + /*has_ownership=*/false); + MP_RETURN_IF_ERROR(extractor_->ExtractSubRectToBuffer( + input_texture, + tflite::gpu::HW(source_texture.height(), source_texture.width()), + roi, + /*flip_horizontaly=*/false, transform.scale, transform.offset, + tflite::gpu::HW(output_dims.height, output_dims.width), + command_queue_.get(), &output)); + + return ::mediapipe::OkStatus(); + })); + + return tensor; + } + + ~GlProcessor() override { + gl_helper_.RunInGlContext([this]() { + // Release OpenGL resources. + extractor_ = nullptr; + command_queue_ = nullptr; + }); + } + + private: + std::unique_ptr command_queue_; + std::unique_ptr extractor_; + mediapipe::GlCalculatorHelper gl_helper_; +}; + +} // namespace + +::mediapipe::StatusOr> +CreateImageToGlBufferTensorConverter(CalculatorContext* cc, + bool input_starts_at_bottom) { + auto result = absl::make_unique(); + MP_RETURN_IF_ERROR(result->Init(cc, input_starts_at_bottom)); + + // Simply "return std::move(result)" failed to build on macOS with bazel. + return std::unique_ptr(std::move(result)); +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.h b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.h new file mode 100644 index 0000000000..51ca9172ff --- /dev/null +++ b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_buffer.h @@ -0,0 +1,41 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_GL_BUFFER_H_
+#define MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_GL_BUFFER_H_
+
+#include "mediapipe/framework/port.h"
+
+#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
+
+#include <memory>
+
+#include "mediapipe/calculators/tensor/image_to_tensor_converter.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/port/statusor.h"
+
+namespace mediapipe {
+
+// Creates image to tensor (represented as OpenGL buffer) converter.
+// NOTE: mediapipe::GlCalculatorHelper::UpdateContract invocation must precede
+// converter creation.
+::mediapipe::StatusOr<std::unique_ptr<ImageToTensorConverter>>
+CreateImageToGlBufferTensorConverter(CalculatorContext* cc,
+                                     bool input_starts_at_bottom);
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
+
+#endif  // MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_GL_BUFFER_H_
diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.cc b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.cc
new file mode 100644
index 0000000000..b02fb98c0b
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.cc
@@ -0,0 +1,323 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.h" + +#include "mediapipe/framework/port.h" + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_20 + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "mediapipe/calculators/tensor/image_to_tensor_converter.h" +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/canonical_errors.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/framework/port/statusor.h" +#include "mediapipe/gpu/gl_calculator_helper.h" +#include "mediapipe/gpu/gl_simple_shaders.h" +#include "mediapipe/gpu/gpu_buffer.h" +#include "mediapipe/gpu/shader_util.h" + +namespace mediapipe { + +namespace { + +class GlParametersOverride { + public: + static ::mediapipe::StatusOr Create( + const std::vector>& overrides) { + std::vector old_values(overrides.size()); + for (int i = 0; i < overrides.size(); ++i) { + glGetTexParameteriv(GL_TEXTURE_2D, overrides[i].first, &old_values[i]); + if (overrides[i].second != old_values[i]) { + glTexParameteri(GL_TEXTURE_2D, overrides[i].first, overrides[i].second); + } + } + return GlParametersOverride(overrides, std::move(old_values)); + } + + ::mediapipe::Status Revert() { + for (int i = 0; i < overrides_.size(); ++i) { + if (overrides_[i].second != old_values_[i]) { + glTexParameteri(GL_TEXTURE_2D, overrides_[i].first, old_values_[i]); + } + } + return ::mediapipe::OkStatus(); + } + + private: + GlParametersOverride(const std::vector>& overrides, + std::vector old_values) + : overrides_(overrides), old_values_(std::move(old_values)) {} + + std::vector> overrides_; + std::vector old_values_; +}; + +constexpr int kAttribVertex = 0; +constexpr int kAttribTexturePosition = 1; +constexpr int kNumAttributes = 2; + +class GlProcessor : public ImageToTensorConverter { + public: + ::mediapipe::Status Init(CalculatorContext* cc, bool input_starts_at_bottom) { + MP_RETURN_IF_ERROR(gl_helper_.Open(cc)); + return gl_helper_.RunInGlContext([this, input_starts_at_bottom]() + -> ::mediapipe::Status { + const GLint attr_location[kNumAttributes] = { + kAttribVertex, + kAttribTexturePosition, + }; + const GLchar* attr_name[kNumAttributes] = { + "position", + "texture_coordinate", + }; + + constexpr GLchar kExtractSubRectVertexShader[] = R"( + in vec4 position; + in mediump vec4 texture_coordinate; + out mediump vec2 sample_coordinate; + uniform mat4 transform_matrix; + + void main() { + gl_Position = position; + // Apply transformation from roi coordinates to original image coordinates. + vec4 tc = transform_matrix * texture_coordinate; + #ifdef INPUT_STARTS_AT_BOTTOM + // Opengl texture sampler has origin in lower left corner, + // so we invert y coordinate. + tc.y = 1.0 - tc.y; + #endif // defined(INPUT_STARTS_AT_BOTTOM) + sample_coordinate = tc.xy; + } + )"; + + constexpr GLchar kExtractSubRectFragBody[] = R"( + DEFAULT_PRECISION(mediump, float) + + // Provided by kExtractSubRectVertexShader. 
+ in vec2 sample_coordinate; + + uniform sampler2D input_texture; + uniform float alpha; + uniform float beta; + + #ifdef GL_ES + #define fragColor gl_FragColor + #else + out vec4 fragColor; + #endif // defined(GL_ES); + + void main() { + fragColor = alpha * texture2D(input_texture, sample_coordinate) + beta; + } + )"; + + std::string starts_at_bottom_def; + if (input_starts_at_bottom) { + starts_at_bottom_def = R"( + #define INPUT_STARTS_AT_BOTTOM + )"; + } + + // Create program and set parameters. + const std::string extract_sub_rect_vertex_src = + absl::StrCat(mediapipe::kMediaPipeVertexShaderPreamble, + starts_at_bottom_def, kExtractSubRectVertexShader); + const std::string extract_sub_rect_frag_src = absl::StrCat( + mediapipe::kMediaPipeFragmentShaderPreamble, kExtractSubRectFragBody); + mediapipe::GlhCreateProgram(extract_sub_rect_vertex_src.c_str(), + extract_sub_rect_frag_src.c_str(), + kNumAttributes, &attr_name[0], attr_location, + &program_); + + RET_CHECK(program_) << "Problem initializing image to tensor program."; + glUseProgram(program_); + glUniform1i(glGetUniformLocation(program_, "input_texture"), 1); + alpha_id_ = glGetUniformLocation(program_, "alpha"); + beta_id_ = glGetUniformLocation(program_, "beta"); + matrix_id_ = glGetUniformLocation(program_, "transform_matrix"); + + glGenFramebuffers(1, &framebuffer_); + + // vertex storage + glGenBuffers(2, vbo_); + glGenVertexArrays(1, &vao_); + + // vbo 0 + glBindBuffer(GL_ARRAY_BUFFER, vbo_[0]); + glBufferData(GL_ARRAY_BUFFER, sizeof(mediapipe::kBasicSquareVertices), + mediapipe::kBasicSquareVertices, GL_STATIC_DRAW); + + // vbo 1 + glBindBuffer(GL_ARRAY_BUFFER, vbo_[1]); + glBufferData(GL_ARRAY_BUFFER, sizeof(mediapipe::kBasicTextureVertices), + mediapipe::kBasicTextureVertices, GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + + return ::mediapipe::OkStatus(); + }); + } + + Size GetImageSize(const Packet& image_packet) override { + const auto& image = image_packet.Get(); + return {image.width(), image.height()}; + } + + ::mediapipe::StatusOr Convert(const Packet& image_packet, + const RotatedRect& roi, + const Size& output_dims, + float range_min, + float range_max) override { + const auto& input = image_packet.Get(); + if (input.format() != mediapipe::GpuBufferFormat::kBGRA32) { + return InvalidArgumentError( + absl::StrCat("Only BGRA/RGBA textures are supported, passed format: ", + static_cast(input.format()))); + } + + constexpr int kNumChannels = 3; + Tensor tensor( + Tensor::ElementType::kFloat32, + Tensor::Shape{1, output_dims.height, output_dims.width, kNumChannels}); + + MP_RETURN_IF_ERROR(gl_helper_.RunInGlContext( + [this, &tensor, &input, &roi, &output_dims, range_min, + range_max]() -> ::mediapipe::Status { + auto input_texture = gl_helper_.CreateSourceTexture(input); + + constexpr float kInputImageRangeMin = 0.0f; + constexpr float kInputImageRangeMax = 1.0f; + ASSIGN_OR_RETURN(auto transform, + GetValueRangeTransformation(kInputImageRangeMin, + kInputImageRangeMax, + range_min, range_max)); + auto tensor_view = tensor.GetOpenGlTexture2dWriteView(); + MP_RETURN_IF_ERROR(ExtractSubRect(input_texture, roi, + /*flip_horizontaly=*/false, + transform.scale, transform.offset, + output_dims, &tensor_view)); + return ::mediapipe::OkStatus(); + })); + + return tensor; + } + + ::mediapipe::Status ExtractSubRect(const mediapipe::GlTexture& texture, + const RotatedRect& sub_rect, + bool flip_horizontaly, float alpha, + float beta, const Size& output_dims, + Tensor::OpenGlTexture2dView* output) { + std::array 
transform_mat; + GetRotatedSubRectToRectTransformMatrix(sub_rect, texture.width(), + texture.height(), flip_horizontaly, + &transform_mat); + + glDisable(GL_DEPTH_TEST); + glBindFramebuffer(GL_FRAMEBUFFER, framebuffer_); + glViewport(0, 0, output_dims.width, output_dims.height); + + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, output->name()); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, + output->name(), 0); + + glActiveTexture(GL_TEXTURE1); + glBindTexture(texture.target(), texture.name()); + + ASSIGN_OR_RETURN(auto overrides, GlParametersOverride::Create( + {{GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE}, + {GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE}, + {GL_TEXTURE_MIN_FILTER, GL_LINEAR}, + {GL_TEXTURE_MAG_FILTER, GL_LINEAR}})); + + glUseProgram(program_); + glUniform1f(alpha_id_, alpha); + glUniform1f(beta_id_, beta); + glUniformMatrix4fv(matrix_id_, 1, GL_TRUE, transform_mat.data()); + + // vao + glBindVertexArray(vao_); + + // vbo 0 + glBindBuffer(GL_ARRAY_BUFFER, vbo_[0]); + glEnableVertexAttribArray(kAttribVertex); + glVertexAttribPointer(kAttribVertex, 2, GL_FLOAT, 0, 0, nullptr); + + // vbo 1 + glBindBuffer(GL_ARRAY_BUFFER, vbo_[1]); + glEnableVertexAttribArray(kAttribTexturePosition); + glVertexAttribPointer(kAttribTexturePosition, 2, GL_FLOAT, 0, 0, nullptr); + + // draw + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + + // cleanup + glDisableVertexAttribArray(kAttribVertex); + glDisableVertexAttribArray(kAttribTexturePosition); + glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindVertexArray(0); + + glActiveTexture(GL_TEXTURE1); + glBindTexture(GL_TEXTURE_2D, 0); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, 0); + + return overrides.Revert(); + } + + ~GlProcessor() override { + gl_helper_.RunInGlContext([this]() { + // Release OpenGL resources. + if (framebuffer_ != 0) glDeleteFramebuffers(1, &framebuffer_); + if (program_ != 0) glDeleteProgram(program_); + if (vao_ != 0) glDeleteVertexArrays(1, &vao_); + glDeleteBuffers(2, vbo_); + }); + } + + private: + mediapipe::GlCalculatorHelper gl_helper_; + GLuint vao_ = 0; + GLuint vbo_[2] = {0, 0}; + GLuint program_ = 0; + GLuint framebuffer_ = 0; + GLint alpha_id_ = 0; + GLint beta_id_ = 0; + GLint matrix_id_ = 0; +}; + +} // namespace + +::mediapipe::StatusOr> +CreateImageToGlTextureTensorConverter(CalculatorContext* cc, + bool input_starts_at_bottom) { + auto result = absl::make_unique(); + MP_RETURN_IF_ERROR(result->Init(cc, input_starts_at_bottom)); + + // Simply "return std::move(result)" failed to build on macOS with bazel. + return std::unique_ptr(std::move(result)); +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_20 diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.h b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.h new file mode 100644 index 0000000000..4ae224e7da --- /dev/null +++ b/mediapipe/calculators/tensor/image_to_tensor_converter_gl_texture.h @@ -0,0 +1,42 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_GL_TEXTURE_H_
+#define MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_GL_TEXTURE_H_
+
+#include "mediapipe/framework/port.h"
+
+#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_20
+
+#include <memory>
+
+#include "mediapipe/calculators/tensor/image_to_tensor_converter.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/port/statusor.h"
+
+namespace mediapipe {
+
+// Creates image to tensor (represented as OpenGL texture) converter.
+// NOTE: mediapipe::GlCalculatorHelper::UpdateContract invocation must precede
+// converter creation.
+::mediapipe::StatusOr<std::unique_ptr<ImageToTensorConverter>>
+CreateImageToGlTextureTensorConverter(CalculatorContext* cc,
+                                      bool input_starts_at_bottom);
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_20
+
+#endif  // MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_GL_TEXTURE_H_
diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_metal.cc b/mediapipe/calculators/tensor/image_to_tensor_converter_metal.cc
new file mode 100644
index 0000000000..2acb127e3e
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_converter_metal.cc
@@ -0,0 +1,397 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mediapipe/calculators/tensor/image_to_tensor_converter_metal.h" + +#if MEDIAPIPE_METAL_ENABLED + +#import + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "mediapipe/calculators/tensor/image_to_tensor_converter.h" +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/canonical_errors.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/framework/port/statusor.h" +#include "mediapipe/gpu/MPPMetalHelper.h" +#include "mediapipe/gpu/gpu_buffer.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" + +namespace mediapipe { + +namespace { + +// clang-format off +// a square formed by 2 triangles +const float kBasicSquareVertices[] = { + -1, 1, 0, 1, + 1, 1, 0, 1, + 1, -1, 0, 1, + -1, 1, 0, 1, + 1, -1, 0, 1, + -1, -1, 0, 1, +}; + +// maps a texture to kBasicSquareVertices via aspect fill +const float kBasicTextureVertices[] = { + 0, 0, 0, 1, + 1, 0, 0, 1, + 1, 1, 0, 1, + 0, 0, 0, 1, + 1, 1, 0, 1, + 0, 1, 0, 1, +}; +// clang-format on + +constexpr char kShaderLibHeader[] = R"( + #include + + using namespace metal; + + struct TextureVertex + { + float4 position [[position]]; + float2 uv; + }; +)"; + +constexpr char kVertexShader[] = R"( + vertex TextureVertex vertexShader( + constant float4 *position [[buffer(0)]], + device float4* tex_coords [[buffer(1)]], + constant float4x4& transform_matrix [[buffer(2)]], + uint vid [[vertex_id]]) { + TextureVertex vert; + vert.position = position[vid]; + vert.uv = (tex_coords[vid] * transform_matrix).xy; + return vert; + } +)"; + +constexpr char kFragmentShader[] = R"( + #ifdef OUTPUT_F16C4 + #define Type4 half4 + #define Type half + #endif // OUTPUT_F16C4 + + #ifdef OUTPUT_F32C4 + #define Type4 float4 + #define Type float + #endif // OUTPUT_F32C4 + + fragment Type4 fragmentShader(TextureVertex vertex_output [[stage_in]], + texture2d texture [[texture(0)]], + constant float* parameters [[buffer(1)]]) + { + const float alpha = parameters[0]; + const float beta = parameters[1]; + + constexpr sampler linear_sampler(address::clamp_to_edge, min_filter::linear, + mag_filter::linear); + + Type4 texture_pixel = texture.sample(linear_sampler, vertex_output.uv); + return Type4(alpha * texture_pixel.rgb + beta, 0); + } +)"; + +enum class OutputFormat { kF16C4, kF32C4 }; + +MTLPixelFormat GetPixelFormat(OutputFormat output_format) { + switch (output_format) { + case OutputFormat::kF16C4: + return MTLPixelFormatRGBA16Float; + case OutputFormat::kF32C4: + return MTLPixelFormatRGBA32Float; + } +} +int GetBytesPerRaw(OutputFormat output_format, const tflite::gpu::HW& size) { + std::size_t type_size; + switch (output_format) { + case OutputFormat::kF16C4: + type_size = sizeof(tflite::gpu::HalfBits); + break; + case OutputFormat::kF32C4: + type_size = sizeof(float); + break; + } + constexpr int kNumChannels = 4; + return size.w * kNumChannels * type_size; +} + +class SubRectExtractorMetal { + public: + static ::mediapipe::StatusOr> Make( + id device, OutputFormat output_format) { + id pipeline_state; + MP_RETURN_IF_ERROR(SubRectExtractorMetal::MakePipelineState( + device, output_format, &pipeline_state)); + + return absl::make_unique(device, pipeline_state, + output_format); + } + + SubRectExtractorMetal(id device, + id pipeline_state, + OutputFormat 
output_format) + : device_(device), + pipeline_state_(pipeline_state), + output_format_(output_format) { + positions_buffer_ = + [device_ newBufferWithBytes:kBasicSquareVertices + length:sizeof(kBasicSquareVertices) + options:MTLResourceOptionCPUCacheModeDefault]; + + tex_coords_buffer_ = + [device_ newBufferWithBytes:kBasicTextureVertices + length:sizeof(kBasicTextureVertices) + options:MTLResourceOptionCPUCacheModeDefault]; + + transform_mat_buffer_ = + [device_ newBufferWithBytes:&transform_mat_ + length:sizeof(transform_mat_) + options:MTLResourceOptionCPUCacheModeDefault]; + } + + ::mediapipe::Status Execute(id input_texture, + const RotatedRect& sub_rect, + bool flip_horizontaly, float alpha, float beta, + const tflite::gpu::HW& destination_size, + id command_buffer, + id destination) { + auto output_texture = MTLTextureWithBuffer(destination_size, destination); + return InternalExecute(input_texture, sub_rect, flip_horizontaly, alpha, + beta, destination_size, command_buffer, + output_texture); + } + + private: + id MTLTextureWithBuffer(const tflite::gpu::HW& size, + id buffer) { + MTLTextureDescriptor* texture_desc = [MTLTextureDescriptor + texture2DDescriptorWithPixelFormat:GetPixelFormat(output_format_) + width:size.w + height:size.h + mipmapped:NO]; + texture_desc.usage = MTLTextureUsageRenderTarget; + + NSUInteger output_bytes_per_row = GetBytesPerRaw(output_format_, size); + + id texture = + [buffer newTextureWithDescriptor:texture_desc + offset:0 + bytesPerRow:output_bytes_per_row]; + return texture; + } + + ::mediapipe::Status InternalExecute(id input_texture, + const RotatedRect& sub_rect, + bool flip_horizontaly, float alpha, + float beta, + const tflite::gpu::HW& destination_size, + id command_buffer, + id output_texture) { + RET_CHECK(command_buffer != nil); + RET_CHECK(output_texture != nil); + + // Obtain texture mapping coordinates transformation matrix and copy its + // data to the buffer. + GetRotatedSubRectToRectTransformMatrix(sub_rect, input_texture.width, + input_texture.height, + flip_horizontaly, &transform_mat_); + std::memcpy(reinterpret_cast(transform_mat_buffer_.contents), + transform_mat_.data(), sizeof(transform_mat_)); + + // Create parameters wrapper. + float parameters[] = {alpha, beta}; + + // Now everything is ready to go! + // Setup render pass. + MTLRenderPassDescriptor* render_pass_desc = + [MTLRenderPassDescriptor renderPassDescriptor]; + render_pass_desc.colorAttachments[0].texture = output_texture; + render_pass_desc.colorAttachments[0].storeAction = MTLStoreActionStore; + render_pass_desc.colorAttachments[0].loadAction = MTLLoadActionClear; + + // Setup render command encoder. 
+ id command_encoder = + [command_buffer renderCommandEncoderWithDescriptor:render_pass_desc]; + [command_encoder setRenderPipelineState:pipeline_state_]; + [command_encoder setVertexBuffer:positions_buffer_ offset:0 atIndex:0]; + [command_encoder setVertexBuffer:tex_coords_buffer_ offset:0 atIndex:1]; + [command_encoder setVertexBuffer:transform_mat_buffer_ offset:0 atIndex:2]; + [command_encoder setFragmentTexture:input_texture atIndex:0]; + [command_encoder setFragmentBytes:¶meters + length:sizeof(parameters) + atIndex:1]; + + [command_encoder drawPrimitives:MTLPrimitiveTypeTriangle + vertexStart:0 + vertexCount:6]; + [command_encoder endEncoding]; + + return ::mediapipe::OkStatus(); + } + + static ::mediapipe::Status MakePipelineState( + id device, OutputFormat output_format, + id* pipeline_state) { + RET_CHECK(pipeline_state != nil); + + std::string output_type_def; + MTLPixelFormat pixel_format; + switch (output_format) { + case OutputFormat::kF16C4: + output_type_def = R"( + #define OUTPUT_F16C4 + )"; + break; + case OutputFormat::kF32C4: + output_type_def = R"( + #define OUTPUT_F32C4 + )"; + break; + } + + std::string shader_lib = absl::StrCat(kShaderLibHeader, output_type_def, + kVertexShader, kFragmentShader); + NSError* error = nil; + NSString* library_source = + [NSString stringWithUTF8String:shader_lib.c_str()]; + + id library = + [device newLibraryWithSource:library_source options:nil error:&error]; + RET_CHECK(library != nil) << "Couldn't create a shader library" + << [[error localizedDescription] UTF8String]; + + id vertex_function = + [library newFunctionWithName:@"vertexShader"]; + RET_CHECK(vertex_function != nil) + << "Failed creating a new vertex function!"; + + id fragment_function = + [library newFunctionWithName:@"fragmentShader"]; + RET_CHECK(fragment_function != nil) + << "Failed creating a new fragment function!"; + + MTLRenderPipelineDescriptor* pipelineDescriptor = + [MTLRenderPipelineDescriptor new]; + pipelineDescriptor.vertexFunction = vertex_function; + pipelineDescriptor.fragmentFunction = fragment_function; + pipelineDescriptor.colorAttachments[0].pixelFormat = + GetPixelFormat(output_format); + + *pipeline_state = + [device newRenderPipelineStateWithDescriptor:pipelineDescriptor + error:&error]; + RET_CHECK(error == nil) << "Couldn't create a pipeline state" + << [[error localizedDescription] UTF8String]; + + return ::mediapipe::OkStatus(); + } + + id positions_buffer_; + id tex_coords_buffer_; + id transform_mat_buffer_; + id device_; + id pipeline_state_; + std::array transform_mat_; + OutputFormat output_format_; +}; + +class MetalProcessor : public ImageToTensorConverter { + public: + ::mediapipe::Status Init(CalculatorContext* cc) { + metal_helper_ = [[MPPMetalHelper alloc] initWithCalculatorContext:cc]; + RET_CHECK(metal_helper_); + ASSIGN_OR_RETURN(extractor_, + SubRectExtractorMetal::Make(metal_helper_.mtlDevice, + OutputFormat::kF32C4)); + return ::mediapipe::OkStatus(); + } + + Size GetImageSize(const Packet& image_packet) override { + const auto& image = image_packet.Get(); + return {image.width(), image.height()}; + } + + ::mediapipe::StatusOr Convert(const Packet& image_packet, + const RotatedRect& roi, + const Size& output_dims, + float range_min, + float range_max) override { + const auto& input = image_packet.Get(); + if (input.format() != mediapipe::GpuBufferFormat::kBGRA32) { + return InvalidArgumentError( + absl::StrCat("Only BGRA/RGBA textures are supported, passed " + "format: ", + static_cast(input.format()))); + } + + 
@autoreleasepool {
+      id<MTLTexture> texture = [metal_helper_ metalTextureWithGpuBuffer:input];
+
+      constexpr int kNumChannels = 4;
+      Tensor tensor(Tensor::ElementType::kFloat32,
+                    Tensor::Shape{1, output_dims.height, output_dims.width,
+                                  kNumChannels});
+
+      constexpr float kInputImageRangeMin = 0.0f;
+      constexpr float kInputImageRangeMax = 1.0f;
+      ASSIGN_OR_RETURN(
+          auto transform,
+          GetValueRangeTransformation(kInputImageRangeMin, kInputImageRangeMax,
+                                      range_min, range_max));
+
+      id<MTLCommandBuffer> command_buffer = [metal_helper_ commandBuffer];
+      const auto& buffer_view = tensor.GetMtlBufferWriteView(command_buffer);
+      MP_RETURN_IF_ERROR(extractor_->Execute(
+          texture, roi,
+          /*flip_horizontaly=*/false, transform.scale, transform.offset,
+          tflite::gpu::HW(output_dims.height, output_dims.width),
+          command_buffer, buffer_view.buffer()));
+      [command_buffer commit];
+      // TODO: consider removing waitUntilCompleted
+      [command_buffer waitUntilCompleted];
+
+      return tensor;
+    }
+  }
+
+ private:
+  MPPMetalHelper* metal_helper_ = nil;
+  std::unique_ptr<SubRectExtractorMetal> extractor_;
+};
+
+}  // namespace
+
+::mediapipe::StatusOr<std::unique_ptr<ImageToTensorConverter>>
+CreateMetalConverter(CalculatorContext* cc) {
+  auto result = absl::make_unique<MetalProcessor>();
+  MP_RETURN_IF_ERROR(result->Init(cc));
+
+  // Simply "return std::move(result)" failed to build on macOS with bazel.
+  return std::unique_ptr<ImageToTensorConverter>(std::move(result));
+}
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_METAL_ENABLED
diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_metal.h b/mediapipe/calculators/tensor/image_to_tensor_converter_metal.h
new file mode 100644
index 0000000000..c20c2cf5c3
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_converter_metal.h
@@ -0,0 +1,40 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_METAL_H_
+#define MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_METAL_H_
+
+#include "mediapipe/framework/port.h"
+
+#if MEDIAPIPE_METAL_ENABLED
+
+#include <memory>
+
+#include "mediapipe/calculators/tensor/image_to_tensor_converter.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/port/statusor.h"
+
+namespace mediapipe {
+
+// Creates Metal image-to-tensor converter.
+// NOTE: [MPPMetalHelper updateContract:...] invocation must precede
+// converter creation.
+::mediapipe::StatusOr<std::unique_ptr<ImageToTensorConverter>>
+CreateMetalConverter(CalculatorContext* cc);
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_METAL_ENABLED
+
+#endif  // MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_METAL_H_
diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_opencv.cc b/mediapipe/calculators/tensor/image_to_tensor_converter_opencv.cc
new file mode 100644
index 0000000000..8c49c93220
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_converter_opencv.cc
@@ -0,0 +1,116 @@
+// Copyright 2020 The MediaPipe Authors.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/calculators/tensor/image_to_tensor_converter_opencv.h" + +#include +#include + +#include "mediapipe/calculators/tensor/image_to_tensor_converter.h" +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/image_format.pb.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/formats/image_frame_opencv.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/canonical_errors.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/statusor.h" + +namespace mediapipe { + +namespace { + +class OpenCvProcessor : public ImageToTensorConverter { + public: + Size GetImageSize(const Packet& image_packet) override { + const auto& image = image_packet.Get(); + return {image.Width(), image.Height()}; + } + + ::mediapipe::StatusOr Convert(const Packet& image_packet, + const RotatedRect& roi, + const Size& output_dims, + float range_min, + float range_max) override { + const auto& input = image_packet.Get(); + if (input.Format() != mediapipe::ImageFormat::SRGB && + input.Format() != mediapipe::ImageFormat::SRGBA) { + return InvalidArgumentError( + absl::StrCat("Only RGBA/RGB formats are supported, passed format: ", + static_cast(input.Format()))); + } + cv::Mat src = mediapipe::formats::MatView(&input); + + constexpr int kNumChannels = 3; + Tensor tensor( + Tensor::ElementType::kFloat32, + Tensor::Shape{1, output_dims.height, output_dims.width, kNumChannels}); + auto buffer_view = tensor.GetCpuWriteView(); + cv::Mat dst(output_dims.height, output_dims.width, CV_32FC3, + buffer_view.buffer()); + + const cv::RotatedRect rotated_rect(cv::Point2f(roi.center_x, roi.center_y), + cv::Size2f(roi.width, roi.height), + roi.rotation * 180.f / M_PI); + cv::Mat src_points; + cv::boxPoints(rotated_rect, src_points); + + const float dst_width = output_dims.width; + const float dst_height = output_dims.height; + /* clang-format off */ + float dst_corners[8] = {0.0f, dst_height, + 0.0f, 0.0f, + dst_width, 0.0f, + dst_width, dst_height}; + /* clang-format on */ + + cv::Mat dst_points = cv::Mat(4, 2, CV_32F, dst_corners); + cv::Mat projection_matrix = + cv::getPerspectiveTransform(src_points, dst_points); + cv::Mat transformed; + cv::warpPerspective(src, transformed, projection_matrix, + cv::Size(dst_width, dst_height), + /*flags=*/cv::INTER_LINEAR, + /*borderMode=*/cv::BORDER_REPLICATE); + + if (transformed.channels() > kNumChannels) { + cv::Mat proper_channels_mat; + cv::cvtColor(transformed, proper_channels_mat, cv::COLOR_RGBA2RGB); + transformed = proper_channels_mat; + } + + constexpr float kInputImageRangeMin = 0.0f; + constexpr float kInputImageRangeMax = 255.0f; + ASSIGN_OR_RETURN( + auto transform, + GetValueRangeTransformation(kInputImageRangeMin, kInputImageRangeMax, + 
range_min, range_max));
+    transformed.convertTo(dst, CV_32FC3, transform.scale, transform.offset);
+    return tensor;
+  }
+};
+
+}  // namespace
+
+::mediapipe::StatusOr<std::unique_ptr<ImageToTensorConverter>>
+CreateOpenCvConverter(CalculatorContext* cc) {
+  // Simply "return absl::make_unique<OpenCvProcessor>()" failed to build on
+  // macOS with bazel.
+  return std::unique_ptr<ImageToTensorConverter>(
+      absl::make_unique<OpenCvProcessor>());
+}
+
+}  // namespace mediapipe
diff --git a/mediapipe/calculators/tensor/image_to_tensor_converter_opencv.h b/mediapipe/calculators/tensor/image_to_tensor_converter_opencv.h
new file mode 100644
index 0000000000..a667029ec1
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_converter_opencv.h
@@ -0,0 +1,32 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_OPENCV_H_
+#define MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_OPENCV_H_
+
+#include <memory>
+
+#include "mediapipe/calculators/tensor/image_to_tensor_converter.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/port/statusor.h"
+
+namespace mediapipe {
+
+// Creates OpenCV image-to-tensor converter.
+::mediapipe::StatusOr<std::unique_ptr<ImageToTensorConverter>>
+CreateOpenCvConverter(CalculatorContext* cc);
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_CONVERTER_OPENCV_H_
diff --git a/mediapipe/calculators/tensor/image_to_tensor_utils.cc b/mediapipe/calculators/tensor/image_to_tensor_utils.cc
new file mode 100644
index 0000000000..c2bfc0f533
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_utils.cc
@@ -0,0 +1,176 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
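Aside (illustrative, not part of the patch): GetValueRangeTransformation(), defined later in this file, reduces to a linear scale/offset pair. A self-contained sketch of that arithmetic for the common case of mapping [0, 255] pixel values into a [-1, 1] float range (the main() wrapper and variable names are examples only):

    #include <cassert>
    #include <cmath>

    int main() {
      const float from_min = 0.0f, from_max = 255.0f;  // source pixel range
      const float to_min = -1.0f, to_max = 1.0f;       // target tensor range
      // Same formula as GetValueRangeTransformation() below.
      const float scale = (to_max - to_min) / (from_max - from_min);  // 2/255
      const float offset = to_min - from_min * scale;                 // -1
      assert(std::fabs(0.0f * scale + offset - (-1.0f)) < 1e-6f);  // black -> -1
      assert(std::fabs(255.0f * scale + offset - 1.0f) < 1e-6f);   // white -> +1
      return 0;
    }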
+
+#include "mediapipe/calculators/tensor/image_to_tensor_utils.h"
+
+#include <cmath>
+
+#include "absl/types/optional.h"
+#include "mediapipe/framework/port/ret_check.h"
+#include "mediapipe/framework/port/statusor.h"
+
+namespace mediapipe {
+
+RotatedRect GetRoi(int input_width, int input_height,
+                   absl::optional<mediapipe::NormalizedRect> norm_rect) {
+  if (norm_rect) {
+    return {.center_x = norm_rect->x_center() * input_width,
+            .center_y = norm_rect->y_center() * input_height,
+            .width = norm_rect->width() * input_width,
+            .height = norm_rect->height() * input_height,
+            .rotation = norm_rect->rotation()};
+  }
+  return {.center_x = 0.5f * input_width,
+          .center_y = 0.5f * input_height,
+          .width = static_cast<float>(input_width),
+          .height = static_cast<float>(input_height),
+          .rotation = 0};
+}
+
+::mediapipe::StatusOr<std::array<float, 4>> PadRoi(int input_tensor_width,
+                                                   int input_tensor_height,
+                                                   bool keep_aspect_ratio,
+                                                   RotatedRect* roi) {
+  if (!keep_aspect_ratio) {
+    return std::array<float, 4>{0.0f, 0.0f, 0.0f, 0.0f};
+  }
+
+  RET_CHECK(input_tensor_width > 0 && input_tensor_height > 0)
+      << "Input tensor width and height must be > 0.";
+  const float tensor_aspect_ratio =
+      static_cast<float>(input_tensor_height) / input_tensor_width;
+
+  RET_CHECK(roi->width > 0 && roi->height > 0)
+      << "ROI width and height must be > 0.";
+  const float roi_aspect_ratio = roi->height / roi->width;
+
+  float vertical_padding = 0.0f;
+  float horizontal_padding = 0.0f;
+  float new_width;
+  float new_height;
+  if (tensor_aspect_ratio > roi_aspect_ratio) {
+    new_width = roi->width;
+    new_height = roi->width * tensor_aspect_ratio;
+    vertical_padding = (1.0f - roi_aspect_ratio / tensor_aspect_ratio) / 2.0f;
+  } else {
+    new_width = roi->height / tensor_aspect_ratio;
+    new_height = roi->height;
+    horizontal_padding = (1.0f - tensor_aspect_ratio / roi_aspect_ratio) / 2.0f;
+  }
+
+  roi->width = new_width;
+  roi->height = new_height;
+
+  return std::array<float, 4>{horizontal_padding, vertical_padding,
+                              horizontal_padding, vertical_padding};
+}
+
+::mediapipe::StatusOr<ValueTransformation> GetValueRangeTransformation(
+    float from_range_min, float from_range_max, float to_range_min,
+    float to_range_max) {
+  RET_CHECK_LT(from_range_min, from_range_max)
+      << "Invalid FROM range: min >= max.";
+  RET_CHECK_LT(to_range_min, to_range_max) << "Invalid TO range: min >= max.";
+  const float scale =
+      (to_range_max - to_range_min) / (from_range_max - from_range_min);
+  const float offset = to_range_min - from_range_min * scale;
+  return ValueTransformation{scale, offset};
+}
+
+void GetRotatedSubRectToRectTransformMatrix(
+    const RotatedRect& sub_rect, int rect_width, int rect_height,
+    bool flip_horizontaly, std::array<float, 16>* matrix_ptr) {
+  std::array<float, 16>& matrix = *matrix_ptr;
+  // The resulting matrix is multiplication of below commented out matrices:
+  //   post_scale_matrix
+  //     * translate_matrix
+  //     * rotate_matrix
+  //     * flip_matrix
+  //     * scale_matrix
+  //     * initial_translate_matrix
+
+  // Matrix to convert X,Y to [-0.5, 0.5] range "initial_translate_matrix"
+  // { 1.0f,  0.0f, 0.0f, -0.5f}
+  // { 0.0f,  1.0f, 0.0f, -0.5f}
+  // { 0.0f,  0.0f, 1.0f,  0.0f}
+  // { 0.0f,  0.0f, 0.0f,  1.0f}
+
+  const float a = sub_rect.width;
+  const float b = sub_rect.height;
+  // Matrix to scale X,Y,Z to sub rect "scale_matrix"
+  // Z has the same scale as X.
+  // {   a, 0.0f, 0.0f, 0.0f}
+  // {0.0f,    b, 0.0f, 0.0f}
+  // {0.0f, 0.0f,    a, 0.0f}
+  // {0.0f, 0.0f, 0.0f, 1.0f}
+
+  const float flip = flip_horizontaly ? -1 : 1;
+  // Matrix for optional horizontal flip around middle of output image.
+  // { fl  , 0.0f, 0.0f, 0.0f}
+  // { 0.0f, 1.0f, 0.0f, 0.0f}
+  // { 0.0f, 0.0f, 1.0f, 0.0f}
+  // { 0.0f, 0.0f, 0.0f, 1.0f}
+
+  const float c = std::cos(sub_rect.rotation);
+  const float d = std::sin(sub_rect.rotation);
+  // Matrix to do rotation around Z axis "rotate_matrix"
+  // {    c,   -d, 0.0f, 0.0f}
+  // {    d,    c, 0.0f, 0.0f}
+  // { 0.0f, 0.0f, 1.0f, 0.0f}
+  // { 0.0f, 0.0f, 0.0f, 1.0f}
+
+  const float e = sub_rect.center_x;
+  const float f = sub_rect.center_y;
+  // Matrix to do X,Y translation of sub rect within parent rect
+  // "translate_matrix"
+  // {1.0f, 0.0f, 0.0f, e   }
+  // {0.0f, 1.0f, 0.0f, f   }
+  // {0.0f, 0.0f, 1.0f, 0.0f}
+  // {0.0f, 0.0f, 0.0f, 1.0f}
+
+  const float g = 1.0f / rect_width;
+  const float h = 1.0f / rect_height;
+  // Matrix to scale X,Y,Z to [0.0, 1.0] range "post_scale_matrix"
+  // {g,    0.0f, 0.0f, 0.0f}
+  // {0.0f, h,    0.0f, 0.0f}
+  // {0.0f, 0.0f, g,    0.0f}
+  // {0.0f, 0.0f, 0.0f, 1.0f}
+
+  // row 1
+  matrix[0] = a * c * flip * g;
+  matrix[1] = -b * d * g;
+  matrix[2] = 0.0f;
+  matrix[3] = (-0.5f * a * c * flip + 0.5f * b * d + e) * g;
+
+  // row 2
+  matrix[4] = a * d * flip * h;
+  matrix[5] = b * c * h;
+  matrix[6] = 0.0f;
+  matrix[7] = (-0.5f * b * c - 0.5f * a * d * flip + f) * h;
+
+  // row 3
+  matrix[8] = 0.0f;
+  matrix[9] = 0.0f;
+  matrix[10] = a * g;
+  matrix[11] = 0.0f;
+
+  // row 4
+  matrix[12] = 0.0f;
+  matrix[13] = 0.0f;
+  matrix[14] = 0.0f;
+  matrix[15] = 1.0f;
+}
+
+}  // namespace mediapipe
diff --git a/mediapipe/calculators/tensor/image_to_tensor_utils.h b/mediapipe/calculators/tensor/image_to_tensor_utils.h
new file mode 100644
index 0000000000..6cb7352568
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_utils.h
@@ -0,0 +1,82 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_UTILS_H_
+#define MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_UTILS_H_
+
+#include <array>
+
+#include "absl/types/optional.h"
+#include "mediapipe/framework/formats/rect.pb.h"
+#include "mediapipe/framework/port/statusor.h"
+
+namespace mediapipe {
+
+struct RotatedRect {
+  float center_x;
+  float center_y;
+  float width;
+  float height;
+  float rotation;
+};
+
+// Generates a new ROI or converts it from normalized rect.
+RotatedRect GetRoi(int input_width, int input_height,
+                   absl::optional<mediapipe::NormalizedRect> norm_rect);
+
+// Pads ROI, so extraction happens correctly if aspect ratio is to be kept.
+// Returns letterbox padding applied.
+::mediapipe::StatusOr<std::array<float, 4>> PadRoi(int input_tensor_width,
+                                                   int input_tensor_height,
+                                                   bool keep_aspect_ratio,
+                                                   RotatedRect* roi);
+
+// Represents a transformation of value which involves scaling and offsetting.
+// To apply transformation:
+//   ValueTransformation transform = ...
+//   float transformed_value = transform.scale * value + transform.offset;
+struct ValueTransformation {
+  float scale;
+  float offset;
+};
+
+// Returns value transformation to apply to a value in order to convert it from
+// [from_range_min, from_range_max] into [to_range_min, to_range_max] range.
+// from_range_min must be less than from_range_max
+// to_range_min must be less than to_range_max
+::mediapipe::StatusOr<ValueTransformation> GetValueRangeTransformation(
+    float from_range_min, float from_range_max, float to_range_min,
+    float to_range_max);
+
+// Populates 4x4 "matrix" with row major order transformation matrix which
+// maps (x, y) in range [0, 1] (describing points of @sub_rect)
+// to (x', y') in range [0, 1]*** (describing points of a rect:
+// [0, @rect_width] x [0, @rect_height] = RECT).
+//
+// *** (x', y') will go out of the range for points from @sub_rect
+//     which are not contained by RECT and it's expected behavior
+//
+// @sub_rect - rotated sub rect in absolute coordinates
+// @rect_width - rect width
+// @rect_height - rect height
+// @flip_horizontaly - we need to flip the output buffer.
+// @matrix - 4x4 matrix (array of 16 elements) to populate
+void GetRotatedSubRectToRectTransformMatrix(const RotatedRect& sub_rect,
+                                            int rect_width, int rect_height,
+                                            bool flip_horizontaly,
+                                            std::array<float, 16>* matrix);
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_UTILS_H_
diff --git a/mediapipe/calculators/tensor/image_to_tensor_utils_test.cc b/mediapipe/calculators/tensor/image_to_tensor_utils_test.cc
new file mode 100644
index 0000000000..e9baecc20f
--- /dev/null
+++ b/mediapipe/calculators/tensor/image_to_tensor_utils_test.cc
@@ -0,0 +1,161 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
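Aside (illustrative, not part of the patch or of the test file that follows): for intuition about GetRotatedSubRectToRectTransformMatrix() above, an unrotated, unflipped ROI that covers the whole rect yields the identity mapping in the x/y rows of the row-major matrix. A minimal gtest-style check written against the declarations above (the test itself and its placement are hypothetical):

    #include <array>

    #include "mediapipe/calculators/tensor/image_to_tensor_utils.h"
    #include "mediapipe/framework/port/gtest.h"

    namespace mediapipe {
    namespace {

    TEST(GetRotatedSubRectToRectTransformMatrix, FullFrameRoiIsIdentity) {
      // ROI centered on a 64x64 image and covering it exactly, no rotation.
      RotatedRect roi{.center_x = 32, .center_y = 32, .width = 64,
                      .height = 64, .rotation = 0};
      std::array<float, 16> m;
      GetRotatedSubRectToRectTransformMatrix(roi, /*rect_width=*/64,
                                             /*rect_height=*/64,
                                             /*flip_horizontaly=*/false, &m);
      EXPECT_FLOAT_EQ(m[0], 1.0f);  // x scale
      EXPECT_FLOAT_EQ(m[3], 0.0f);  // x translation
      EXPECT_FLOAT_EQ(m[5], 1.0f);  // y scale
      EXPECT_FLOAT_EQ(m[7], 0.0f);  // y translation
    }

    }  // namespace
    }  // namespace mediapipe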
+ +#include "mediapipe/calculators/tensor/image_to_tensor_utils.h" + +#include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/status_matchers.h" + +namespace mediapipe { +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; + +testing::Matcher EqRotatedRect(float width, float height, + float center_x, float center_y, + float rotation) { + return testing::AllOf( + testing::Field(&RotatedRect::width, testing::FloatEq(width)), + testing::Field(&RotatedRect::height, testing::FloatEq(height)), + testing::Field(&RotatedRect::center_x, testing::FloatEq(center_x)), + testing::Field(&RotatedRect::center_y, testing::FloatEq(center_y)), + testing::Field(&RotatedRect::rotation, testing::FloatEq(rotation))); +} + +TEST(GetRoi, NoNormRect) { + EXPECT_THAT(GetRoi(4, 4, {}), EqRotatedRect(4, 4, 2, 2, 0)); + EXPECT_THAT(GetRoi(25, 15, {}), EqRotatedRect(25, 15, 12.5f, 7.5f, 0)); +} + +TEST(GetRoi, WholeImageNormRect) { + mediapipe::NormalizedRect norm_rect; + norm_rect.set_width(1.0f); + norm_rect.set_height(1.0f); + norm_rect.set_x_center(0.5f); + norm_rect.set_y_center(0.5f); + norm_rect.set_rotation(0.0f); + EXPECT_THAT(GetRoi(4, 4, norm_rect), EqRotatedRect(4, 4, 2, 2, 0)); + EXPECT_THAT(GetRoi(25, 15, norm_rect), EqRotatedRect(25, 15, 12.5f, 7.5f, 0)); +} + +TEST(GetRoi, ExpandedNormRect) { + mediapipe::NormalizedRect norm_rect; + norm_rect.set_width(4.0f); + norm_rect.set_height(2.0f); + norm_rect.set_x_center(0.5f); + norm_rect.set_y_center(1.0f); + norm_rect.set_rotation(3.0f); + EXPECT_THAT(GetRoi(4, 4, norm_rect), EqRotatedRect(16, 8, 2, 4, 3)); + EXPECT_THAT(GetRoi(25, 15, norm_rect), EqRotatedRect(100, 30, 12.5f, 15, 3)); +} + +TEST(PadRoi, NoPadding) { + RotatedRect roi{.center_x = 20, + .center_y = 10, + .width = 100, + .height = 200, + .rotation = 5}; + auto status_or_value = PadRoi(10, 10, /*keep_aspect_ratio=*/false, &roi); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), + ElementsAreArray({0.0f, 0.0f, 0.0f, 0.0f})); + EXPECT_THAT(roi, EqRotatedRect(100, 200, 20, 10, 5)); +} + +TEST(PadRoi, HorizontalPadding) { + RotatedRect roi{.center_x = 20, + .center_y = 10, + .width = 100, + .height = 200, + .rotation = 5}; + auto status_or_value = PadRoi(10, 10, /*keep_aspect_ratio=*/true, &roi); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), + ElementsAreArray({0.25f, 0.0f, 0.25f, 0.0f})); + EXPECT_THAT(roi, EqRotatedRect(200, 200, 20, 10, 5)); +} + +TEST(PadRoi, VerticalPadding) { + RotatedRect roi{ + .center_x = 1, .center_y = 2, .width = 21, .height = 19, .rotation = 3}; + const float expected_horizontal_padding = (21 - 19) / 2.0f / 21; + auto status_or_value = PadRoi(10, 10, /*keep_aspect_ratio=*/true, &roi); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT( + status_or_value.ValueOrDie(), + ElementsAre(testing::FloatEq(0.0f), + testing::FloatNear(expected_horizontal_padding, 1e-6), + testing::FloatEq(0.0f), + testing::FloatNear(expected_horizontal_padding, 1e-6))); + EXPECT_THAT(roi, EqRotatedRect(21, 21, 1, 2, 3)); +} + +testing::Matcher EqValueTransformation(float scale, + float offset) { + return ::testing::AllOf( + testing::Field(&ValueTransformation::scale, testing::FloatEq(scale)), + testing::Field(&ValueTransformation::offset, testing::FloatEq(offset))); +} + +TEST(GetValueRangeTransformation, PixelToFloatZeroCenter) { + auto status_or_value = GetValueRangeTransformation( + /*from_range_min=*/0.0f, /*from_range_max=*/255.0f, + 
/*to_range_min=*/-1.0f, /*to_range_max=*/1.0f); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), + EqValueTransformation(/*scale=*/2 / 255.0f, + /*offset=*/-1.0f)); +} + +TEST(GetValueRangeTransformation, PixelToFloat) { + auto status_or_value = GetValueRangeTransformation( + /*from_range_min=*/0.0f, /*from_range_max=*/255.0f, + /*to_range_min=*/0.0f, /*to_range_max=*/1.0f); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), + EqValueTransformation(/*scale=*/1 / 255.0f, + /*offset=*/0.0f)); +} + +TEST(GetValueRangeTransformation, FloatToFloatNoOp) { + auto status_or_value = GetValueRangeTransformation( + /*from_range_min=*/0.0f, /*from_range_max=*/1.0f, + /*to_range_min=*/0.0f, /*to_range_max=*/1.0f); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), + EqValueTransformation(/*scale=*/1.0f, /*offset=*/0.0f)); +} + +TEST(GetValueRangeTransformation, PixelToPixelNoOp) { + auto status_or_value = GetValueRangeTransformation( + /*from_range_min=*/0.0f, /*from_range_max=*/255.0f, + /*to_range_min=*/0.0f, /*to_range_max=*/255.0f); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), + EqValueTransformation(/*scale=*/1.0f, /*offset=*/0.0f)); +} + +TEST(GetValueRangeTransformation, FloatToPixel) { + auto status_or_value = GetValueRangeTransformation( + /*from_range_min=*/0.0f, /*from_range_max=*/1.0f, + /*to_range_min=*/0.0f, /*to_range_max=*/255.0f); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), + EqValueTransformation(/*scale=*/255.0f, /*offset=*/0.0f)); +} + +} // namespace +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/inference_calculator.cc b/mediapipe/calculators/tensor/inference_calculator.cc new file mode 100644 index 0000000000..dc02de1708 --- /dev/null +++ b/mediapipe/calculators/tensor/inference_calculator.cc @@ -0,0 +1,832 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
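Aside (illustrative, not part of the patch): the small launch-size helpers defined just below in inference_calculator.cc, NumGroups() and RoundUp(), are a plain ceil-divide and a round-to-multiple. A self-contained sketch of what they compute (the main() wrapper and values are examples only; the function bodies mirror the ones in the file):

    #include <cassert>

    // Same arithmetic as the NumGroups()/RoundUp() helpers defined in
    // inference_calculator.cc below.
    int NumGroups(int size, int group_size) {
      return (size + group_size - 1) / group_size;
    }

    template <typename T>
    T RoundUp(T n, T m) {
      return ((n + m - T{1}) / m) * m;
    }

    int main() {
      assert(NumGroups(13, 4) == 4);  // 13 items need 4 groups of size 4.
      assert(RoundUp(13, 4) == 16);   // Next multiple of 4 at or above 13.
      assert(NumGroups(16, 4) == 4);
      assert(RoundUp(16, 4) == 16);
      return 0;
    }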
+ +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "mediapipe/calculators/tensor/inference_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/util/tflite/config.h" + +#if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__) +#include "mediapipe/util/cpu_util.h" +#endif // !__EMSCRIPTEN__ || __EMSCRIPTEN_PTHREADS__ + +#include "mediapipe/util/resource_util.h" +#include "tensorflow/lite/error_reporter.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" + +#if defined(MEDIAPIPE_ANDROID) +#include "mediapipe/util/android/file/base/file.h" +#include "mediapipe/util/android/file/base/filesystem.h" +#include "mediapipe/util/android/file/base/helpers.h" +#endif // ANDROID + +#if MEDIAPIPE_TFLITE_GL_INFERENCE +#include "mediapipe/gpu/gl_calculator_helper.h" +#include "mediapipe/gpu/gpu_buffer.h" +#include "mediapipe/util/tflite/tflite_gpu_runner.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/gl_delegate.h" +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + +#if MEDIAPIPE_TFLITE_METAL_INFERENCE +#import +#import +#import + +#import "mediapipe/gpu/MPPMetalHelper.h" +#include "mediapipe/gpu/MPPMetalUtil.h" +#include "mediapipe/gpu/gpu_buffer.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/metal/buffer_convert.h" +#include "tensorflow/lite/delegates/gpu/metal_delegate.h" +#include "tensorflow/lite/delegates/gpu/metal_delegate_internal.h" +#endif // MEDIAPIPE_TFLITE_METAL_INFERENCE + +#if !defined(MEDIAPIPE_EDGE_TPU) +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" +#endif // !EDGETPU +#if defined(MEDIAPIPE_ANDROID) +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" +#endif // ANDROID + +namespace { +// Commonly used to compute the number of blocks to launch in a kernel. +int NumGroups(const int size, const int group_size) { // NOLINT + return (size + group_size - 1) / group_size; +} + +// Round up n to next multiple of m. +template +T RoundUp(T n, T m) { + return ((n + m - T{1}) / m) * m; +} + +bool ShouldUseGpu(const mediapipe::InferenceCalculatorOptions& options) { + return ( + !options.has_delegate() || // Use GPU delegate if delegate not specified + (options.has_delegate() && options.delegate().has_gpu())); +} + +constexpr char kTensorsTag[] = "TENSORS"; +} // namespace + +#if defined(MEDIAPIPE_EDGE_TPU) +#include "edgetpu.h" + +// Creates and returns an Edge TPU interpreter to run the given edgetpu model. +std::unique_ptr BuildEdgeTpuInterpreter( + const tflite::FlatBufferModel& model, + tflite::ops::builtin::BuiltinOpResolver* resolver, + edgetpu::EdgeTpuContext* edgetpu_context) { + resolver->AddCustom(edgetpu::kCustomOp, edgetpu::RegisterCustomOp()); + std::unique_ptr interpreter; + if (tflite::InterpreterBuilder(model, *resolver)(&interpreter) != kTfLiteOk) { + std::cerr << "Failed to build edge TPU interpreter." << std::endl; + } + interpreter->SetExternalContext(kTfLiteEdgeTpuContext, edgetpu_context); + interpreter->SetNumThreads(1); + if (interpreter->AllocateTensors() != kTfLiteOk) { + std::cerr << "Failed to allocate edge TPU tensors." 
<< std::endl; + } + return interpreter; +} +#endif // MEDIAPIPE_EDGE_TPU + +namespace mediapipe { + +#if MEDIAPIPE_TFLITE_METAL_INFERENCE +namespace { +tflite::gpu::BHWC BhwcFromTensorShape(const Tensor::Shape& shape) { + tflite::gpu::BHWC result; + result.b = shape.dims[0]; + switch (shape.dims.size()) { + case 1: + // result.b is already filled. + break; + case 2: + result.h = 1; + result.w = 1; + result.c = shape.dims[1]; + break; + case 3: + result.h = 1; + result.w = shape.dims[1]; + result.c = shape.dims[2]; + break; + case 4: + result.h = shape.dims[1]; + result.w = shape.dims[2]; + result.c = shape.dims[3]; + break; + default: + // Handles 0 and >4. + LOG(FATAL) + << "Dimensions size must be in range [1,4] for GPU inference, but " + << shape.dims.size() << " is provided"; + } + return result; +} +} // namespace +#endif // MEDIAPIPE_TFLITE_METAL_INFERENCE + +// Returns number of threads to configure XNNPACK delegate with. +// (Equal to user provided value if specified. Otherwise, it returns number of +// high cores (hard-coded to 1 for Emscripten without Threads extension)) +int GetXnnpackNumThreads(const mediapipe::InferenceCalculatorOptions& opts) { + static constexpr int kDefaultNumThreads = -1; + if (opts.has_delegate() && opts.delegate().has_xnnpack() && + opts.delegate().xnnpack().num_threads() != kDefaultNumThreads) { + return opts.delegate().xnnpack().num_threads(); + } +#if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__) + return InferHigherCoreIds().size(); +#else + return 1; +#endif // !__EMSCRIPTEN__ || __EMSCRIPTEN_PTHREADS__ +} + +// Calculator Header Section + +// Runs inference on the provided input Tensors and TFLite model. +// +// Creates an interpreter with given model and calls invoke(). +// Optionally run inference on CPU/GPU. +// +// This calculator can be used with TensorConverterCalculator to get the +// appropriate inputs. +// +// When the input tensors are on CPU, gpu inference is optional and can be +// specified in the calculator options. +// When the input tensors are on GPU, inference is GPU and output can be CPU or +// GPU. +// +// Input: +// TENSORS - Vector of Tensors +// +// Output: +// TENSORS - Vector of Tensors +// +// Input side packet: +// CUSTOM_OP_RESOLVER (optional) - Use a custom op resolver, +// instead of the builtin one. +// MODEL (optional) - Use to specify TfLite model +// (std::unique_ptr>) +// +// Example use: +// node { +// calculator: "InferenceCalculator" +// input_stream: "TENSORS:tensor_image" +// output_stream: "TENSORS:tensors" +// options: { +// [mediapipe.InferenceCalculatorOptions.ext] { +// model_path: "modelname.tflite" +// } +// } +// } +// +// or +// +// node { +// calculator: "InferenceCalculator" +// input_stream: "TENSORS:tensor_image" +// input_side_packet: "MODEL:model" +// output_stream: "TENSORS:tensors" +// options: { +// [mediapipe.InferenceCalculatorOptions.ext] { +// model_path: "modelname.tflite" +// delegate { gpu {} } +// } +// } +// } +// +// IMPORTANT Notes: +// Tensors are assumed to be ordered correctly (sequentially added to model). +// Input tensors are assumed to be of the correct size and already normalized. 
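Aside (illustrative, not part of the patch): downstream consumers receive the "TENSORS" output as a std::vector<Tensor> packet. A rough sketch of reading the first output on CPU, assuming float32 model outputs; the helper name is made up, and the Tensor view calls mirror the ones used elsewhere in this change:

    #include <vector>

    #include "mediapipe/framework/calculator_framework.h"
    #include "mediapipe/framework/formats/tensor.h"

    // Hypothetical consumer-side helper: reads the first tensor of an
    // InferenceCalculator "TENSORS" packet on CPU.
    inline void ReadFirstOutputTensor(const mediapipe::Packet& packet) {
      const auto& tensors = packet.Get<std::vector<mediapipe::Tensor>>();
      if (tensors.empty()) return;
      auto view = tensors[0].GetCpuReadView();
      const float* data = view.buffer<float>();
      int num_elements = 1;
      for (int d : tensors[0].shape().dims) num_elements *= d;
      // Post-process data[0 .. num_elements) here.
      (void)data;
    }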
+ +class InferenceCalculator : public CalculatorBase { + public: + using TfLiteDelegatePtr = + std::unique_ptr>; + using TfLiteModelPtr = + std::unique_ptr>; + + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + private: + ::mediapipe::Status ReadKernelsFromFile(); + ::mediapipe::Status WriteKernelsToFile(); + ::mediapipe::Status LoadModel(CalculatorContext* cc); + ::mediapipe::StatusOr GetModelAsPacket(const CalculatorContext& cc); + ::mediapipe::Status LoadDelegate(CalculatorContext* cc); + ::mediapipe::Status InitTFLiteGPURunner(CalculatorContext* cc); + + Packet model_packet_; + std::unique_ptr interpreter_; + TfLiteDelegatePtr delegate_; + +#if MEDIAPIPE_TFLITE_GL_INFERENCE + mediapipe::GlCalculatorHelper gpu_helper_; + std::unique_ptr tflite_gpu_runner_; +#elif MEDIAPIPE_TFLITE_METAL_INFERENCE + MPPMetalHelper* gpu_helper_ = nullptr; + TFLBufferConvert* converter_to_BPHWC4_ = nil; + TFLBufferConvert* converter_from_BPHWC4_ = nil; +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + +#if MEDIAPIPE_TFLITE_GPU_SUPPORTED + std::vector output_shapes_; + std::vector> gpu_buffers_in_; + std::vector> gpu_buffers_out_; +#endif // MEDIAPIPE_TFLITE_GPU_SUPPORTED + +#if defined(MEDIAPIPE_EDGE_TPU) + std::shared_ptr edgetpu_context_ = + edgetpu::EdgeTpuManager::GetSingleton()->OpenDevice(); +#endif + + bool use_advanced_gpu_api_ = false; + bool use_gpu_delegate_ = false; + + bool use_kernel_caching_ = false; + std::string cached_kernel_filename_; +}; +REGISTER_CALCULATOR(InferenceCalculator); + +::mediapipe::Status InferenceCalculator::GetContract(CalculatorContract* cc) { + RET_CHECK(cc->Inputs().HasTag(kTensorsTag)); + cc->Inputs().Tag(kTensorsTag).Set>(); + RET_CHECK(cc->Outputs().HasTag(kTensorsTag)); + cc->Outputs().Tag(kTensorsTag).Set>(); + + const auto& options = cc->Options<::mediapipe::InferenceCalculatorOptions>(); + RET_CHECK(!options.model_path().empty() ^ + cc->InputSidePackets().HasTag("MODEL")) + << "Either model as side packet or model path in options is required."; + + if (cc->InputSidePackets().HasTag("CUSTOM_OP_RESOLVER")) { + cc->InputSidePackets() + .Tag("CUSTOM_OP_RESOLVER") + .Set(); + } + if (cc->InputSidePackets().HasTag("MODEL")) { + cc->InputSidePackets().Tag("MODEL").Set(); + } + + if (ShouldUseGpu(options)) { +#if MEDIAPIPE_TFLITE_GL_INFERENCE + MP_RETURN_IF_ERROR(mediapipe::GlCalculatorHelper::UpdateContract(cc)); +#elif MEDIAPIPE_TFLITE_METAL_INFERENCE + MP_RETURN_IF_ERROR([MPPMetalHelper updateContract:cc]); +#endif + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status InferenceCalculator::Open(CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + +#if MEDIAPIPE_TFLITE_GL_INFERENCE || MEDIAPIPE_TFLITE_METAL_INFERENCE + const auto& options = cc->Options<::mediapipe::InferenceCalculatorOptions>(); + if (ShouldUseGpu(options)) { +#if MEDIAPIPE_TFLITE_GL_INFERENCE + use_advanced_gpu_api_ = options.has_delegate() && + options.delegate().has_gpu() && + options.delegate().gpu().use_advanced_gpu_api(); + use_kernel_caching_ = + use_advanced_gpu_api_ && options.delegate().gpu().use_kernel_caching(); +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + use_gpu_delegate_ = !use_advanced_gpu_api_; + } +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE || MEDIAPIPE_TFLITE_METAL_INFERENCE + + if (use_kernel_caching_) { +#if MEDIAPIPE_TFLITE_GL_INFERENCE && defined(MEDIAPIPE_ANDROID) + 
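+    // Kernel caching is only wired up for the advanced GPU API on Android. As
+    // an illustration (example model name only), a model loaded from
+    // "face_detection.tflite" is cached as "/sdcard/face_detection.tflite.ker"
+    // by the line below.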
cached_kernel_filename_ = + "/sdcard/" + mediapipe::File::Basename(options.model_path()) + ".ker"; +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE && MEDIAPIPE_ANDROID + } + + // When use_advanced_gpu_api_, model loading is handled in InitTFLiteGPURunner + // for everything. + if (!use_advanced_gpu_api_) { + MP_RETURN_IF_ERROR(LoadModel(cc)); + } + + if (use_gpu_delegate_ || use_advanced_gpu_api_) { +#if MEDIAPIPE_TFLITE_GL_INFERENCE + MP_RETURN_IF_ERROR(gpu_helper_.Open(cc)); + MP_RETURN_IF_ERROR( + gpu_helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status { + return use_advanced_gpu_api_ ? InitTFLiteGPURunner(cc) + : LoadDelegate(cc); + })); +#elif MEDIAPIPE_TFLITE_METAL_INFERENCE + gpu_helper_ = [[MPPMetalHelper alloc] initWithCalculatorContext:cc]; + RET_CHECK(gpu_helper_); + MP_RETURN_IF_ERROR(LoadDelegate(cc)); +#endif + } else { + MP_RETURN_IF_ERROR(LoadDelegate(cc)); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status InferenceCalculator::Process(CalculatorContext* cc) { + if (cc->Inputs().Tag(kTensorsTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + const auto& input_tensors = + cc->Inputs().Tag(kTensorsTag).Get>(); + RET_CHECK(!input_tensors.empty()); + auto output_tensors = absl::make_unique>(); + + if (use_gpu_delegate_ || use_advanced_gpu_api_) { +#if MEDIAPIPE_TFLITE_GL_INFERENCE + if (use_advanced_gpu_api_) { + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext( + [this, &input_tensors, &output_tensors]() -> ::mediapipe::Status { + for (int i = 0; i < input_tensors.size(); ++i) { + MP_RETURN_IF_ERROR(tflite_gpu_runner_->BindSSBOToInputTensor( + input_tensors[i].GetOpenGlBufferReadView().name(), i)); + } + output_tensors->reserve(output_shapes_.size()); + for (int i = 0; i < output_shapes_.size(); ++i) { + output_tensors->emplace_back(Tensor::ElementType::kFloat32, + output_shapes_[i]); + MP_RETURN_IF_ERROR(tflite_gpu_runner_->BindSSBOToOutputTensor( + output_tensors->back().GetOpenGlBufferWriteView().name(), i)); + } + return ::mediapipe::OkStatus(); + })); + } else { + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext( + [this, &input_tensors]() -> ::mediapipe::Status { + // Explicitly copy input. + for (int i = 0; i < input_tensors.size(); ++i) { + glBindBuffer(GL_COPY_READ_BUFFER, + input_tensors[i].GetOpenGlBufferReadView().name()); + glBindBuffer( + GL_COPY_WRITE_BUFFER, + gpu_buffers_in_[i]->GetOpenGlBufferWriteView().name()); + glCopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, 0, + 0, input_tensors[i].bytes()); + } + return ::mediapipe::OkStatus(); + })); + } +#elif MEDIAPIPE_TFLITE_METAL_INFERENCE + // Explicit copy input with conversion float 32 bits to 16 bits. + id command_buffer = [gpu_helper_ commandBuffer]; + command_buffer.label = @"InferenceCalculatorConvert"; + id compute_encoder = + [command_buffer computeCommandEncoder]; + for (int i = 0; i < input_tensors.size(); ++i) { + auto input_view = input_tensors[i].GetMtlBufferReadView(command_buffer); + // Reshape tensor. + tflite::gpu::BHWC shape = BhwcFromTensorShape(input_tensors[i].shape()); + auto gpu_buffer_view = + gpu_buffers_in_[i]->GetMtlBufferWriteView(command_buffer); + [converter_to_BPHWC4_ convertWithEncoder:compute_encoder + shape:shape + sourceBuffer:input_view.buffer() + convertedBuffer:gpu_buffer_view.buffer()]; + } + [compute_encoder endEncoding]; + [command_buffer commit]; +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + } else { + // Read CPU input into tensors. 
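+    // Each input Tensor is copied by index into the interpreter's
+    // corresponding float input buffer via typed_input_tensor<float>(i).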
+ for (int i = 0; i < input_tensors.size(); ++i) { + const Tensor* input_tensor = &input_tensors[i]; + auto input_tensor_view = input_tensor->GetCpuReadView(); + auto input_tensor_buffer = input_tensor_view.buffer(); + float* local_tensor_buffer = interpreter_->typed_input_tensor(i); + std::memcpy(local_tensor_buffer, input_tensor_buffer, + input_tensor->bytes()); + } + } + + // Run inference. +#if MEDIAPIPE_TFLITE_GL_INFERENCE + if (use_advanced_gpu_api_) { + RET_CHECK(tflite_gpu_runner_->Invoke().ok()); + } else { + RET_CHECK_EQ(interpreter_->Invoke(), kTfLiteOk); + } +#else + RET_CHECK_EQ(interpreter_->Invoke(), kTfLiteOk); +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + + if (use_gpu_delegate_ || use_advanced_gpu_api_) { +#if MEDIAPIPE_TFLITE_GL_INFERENCE + if (use_gpu_delegate_) { + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext( + [this, &output_tensors]() -> ::mediapipe::Status { + output_tensors->reserve(output_shapes_.size()); + for (int i = 0; i < output_shapes_.size(); ++i) { + const auto& t = gpu_buffers_out_[i]; + output_tensors->emplace_back(Tensor::ElementType::kFloat32, + gpu_buffers_out_[i]->shape()); + auto read_view = t->GetOpenGlBufferReadView(); + glBindBuffer(GL_COPY_READ_BUFFER, read_view.name()); + auto write_view = + output_tensors->back().GetOpenGlBufferWriteView(); + glBindBuffer(GL_COPY_WRITE_BUFFER, write_view.name()); + glCopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, 0, + 0, t->bytes()); + } + return ::mediapipe::OkStatus(); + })); + } + // Output tensors are already bound if use_advanced_gpu_api_ is true. +#elif MEDIAPIPE_TFLITE_METAL_INFERENCE + id command_buffer = [gpu_helper_ commandBuffer]; + command_buffer.label = @"InferenceBPHWC4Convert"; + id convert_command = + [command_buffer computeCommandEncoder]; + output_tensors->reserve(output_shapes_.size()); + for (int i = 0; i < output_shapes_.size(); ++i) { + output_tensors->emplace_back(Tensor::ElementType::kFloat32, + output_shapes_[i]); + // Reshape tensor. + tflite::gpu::BHWC shape = BhwcFromTensorShape(output_shapes_[i]); + auto read_view = + gpu_buffers_out_[i]->GetMtlBufferReadView(command_buffer); + auto write_view = + output_tensors->at(i).GetMtlBufferWriteView(command_buffer); + [converter_from_BPHWC4_ convertWithEncoder:convert_command + shape:shape + sourceBuffer:read_view.buffer() + convertedBuffer:write_view.buffer()]; + } + [convert_command endEncoding]; + [command_buffer commit]; +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + } else { + // Output result tensors (CPU). + const auto& tensor_indexes = interpreter_->outputs(); + output_tensors->reserve(tensor_indexes.size()); + for (int i = 0; i < tensor_indexes.size(); ++i) { + TfLiteTensor* tensor = interpreter_->tensor(tensor_indexes[i]); + output_tensors->emplace_back( + Tensor::ElementType::kFloat32, + Tensor::Shape{std::vector{ + tensor->dims->data, tensor->dims->data + tensor->dims->size}}); + auto cpu_view = output_tensors->back().GetCpuWriteView(); + std::memcpy(cpu_view.buffer(), tensor->data.f, + output_tensors->back().bytes()); + } + } + cc->Outputs() + .Tag(kTensorsTag) + .Add(output_tensors.release(), cc->InputTimestamp()); + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status InferenceCalculator::WriteKernelsToFile() { +#if MEDIAPIPE_TFLITE_GL_INFERENCE && defined(MEDIAPIPE_ANDROID) + if (use_kernel_caching_) { + // Save kernel file. 
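+    // Serializes the GPU program binaries compiled by the TFLite GPU runner
+    // and writes them to cached_kernel_filename_, so the next run can skip
+    // kernel compilation (see ReadKernelsFromFile).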
+ auto kernel_cache = absl::make_unique>( + tflite_gpu_runner_->GetSerializedBinaryCache()); + std::string cache_str(kernel_cache->begin(), kernel_cache->end()); + MP_RETURN_IF_ERROR( + mediapipe::file::SetContents(cached_kernel_filename_, cache_str)); + } +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE && MEDIAPIPE_ANDROID + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status InferenceCalculator::Close(CalculatorContext* cc) { + MP_RETURN_IF_ERROR(WriteKernelsToFile()); +#if MEDIAPIPE_TFLITE_GL_INFERENCE + if (use_gpu_delegate_) { + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext([this]() -> Status { + gpu_buffers_in_.clear(); + gpu_buffers_out_.clear(); + return ::mediapipe::OkStatus(); + })); + } +#elif MEDIAPIPE_TFLITE_METAL_INFERENCE + converter_to_BPHWC4_ = nil; + converter_from_BPHWC4_ = nil; + gpu_buffers_in_.clear(); + gpu_buffers_out_.clear(); +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + +#if defined(MEDIAPIPE_EDGE_TPU) + edgetpu_context_.reset(); +#endif + interpreter_ = nullptr; + delegate_ = nullptr; + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status InferenceCalculator::ReadKernelsFromFile() { +#if MEDIAPIPE_TFLITE_GL_INFERENCE && defined(MEDIAPIPE_ANDROID) + if (use_kernel_caching_) { + // Load pre-compiled kernel file. + if (mediapipe::File::Exists(cached_kernel_filename_)) { + std::string cache_str; + MP_RETURN_IF_ERROR( + mediapipe::file::GetContents(cached_kernel_filename_, &cache_str)); + std::vector cache_vec(cache_str.begin(), cache_str.end()); + tflite_gpu_runner_->SetSerializedBinaryCache(std::move(cache_vec)); + } + } +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE && MEDIAPIPE_ANDROID + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status InferenceCalculator::InitTFLiteGPURunner( + CalculatorContext* cc) { +#if MEDIAPIPE_TFLITE_GL_INFERENCE + ASSIGN_OR_RETURN(model_packet_, GetModelAsPacket(*cc)); + const auto& model = *model_packet_.Get(); + tflite::ops::builtin::BuiltinOpResolver op_resolver; + if (cc->InputSidePackets().HasTag("CUSTOM_OP_RESOLVER")) { + op_resolver = cc->InputSidePackets() + .Tag("CUSTOM_OP_RESOLVER") + .Get(); + } + + // Create runner + tflite::gpu::InferenceOptions options; + options.priority1 = tflite::gpu::InferencePriority::MIN_LATENCY; + options.priority2 = tflite::gpu::InferencePriority::AUTO; + options.priority3 = tflite::gpu::InferencePriority::AUTO; + options.usage = tflite::gpu::InferenceUsage::SUSTAINED_SPEED; + tflite_gpu_runner_ = std::make_unique(options); + MP_RETURN_IF_ERROR( + tflite_gpu_runner_->InitializeWithModel(model, op_resolver)); + + // Create and bind OpenGL buffers for outputs. 
+ // The buffers are created once and their ids are passed to calculator outputs + output_shapes_.resize(tflite_gpu_runner_->outputs_size()); + for (int i = 0; i < tflite_gpu_runner_->outputs_size(); ++i) { + output_shapes_[i] = {tflite_gpu_runner_->GetOutputShapes()[i].b, + tflite_gpu_runner_->GetOutputShapes()[i].h, + tflite_gpu_runner_->GetOutputShapes()[i].w, + tflite_gpu_runner_->GetOutputShapes()[i].c}; + } + + MP_RETURN_IF_ERROR(ReadKernelsFromFile()); + + MP_RETURN_IF_ERROR(tflite_gpu_runner_->Build()); +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status InferenceCalculator::LoadModel(CalculatorContext* cc) { + ASSIGN_OR_RETURN(model_packet_, GetModelAsPacket(*cc)); + const auto& model = *model_packet_.Get(); + tflite::ops::builtin::BuiltinOpResolver op_resolver; + if (cc->InputSidePackets().HasTag("CUSTOM_OP_RESOLVER")) { + op_resolver = cc->InputSidePackets() + .Tag("CUSTOM_OP_RESOLVER") + .Get(); + } + +#if defined(MEDIAPIPE_EDGE_TPU) + interpreter_ = + BuildEdgeTpuInterpreter(model, &op_resolver, edgetpu_context_.get()); +#else + tflite::InterpreterBuilder(model, op_resolver)(&interpreter_); +#endif // MEDIAPIPE_EDGE_TPU + RET_CHECK(interpreter_); + +#if defined(__EMSCRIPTEN__) || defined(MEDIAPIPE_EDGE_TPU) + interpreter_->SetNumThreads(1); +#else + interpreter_->SetNumThreads( + cc->Options().cpu_num_thread()); +#endif // __EMSCRIPTEN__ + + RET_CHECK_EQ(interpreter_->AllocateTensors(), kTfLiteOk); + // TODO: Support quantized tensors. + CHECK(interpreter_->tensor(interpreter_->inputs()[0])->quantization.type != + kTfLiteAffineQuantization); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::StatusOr InferenceCalculator::GetModelAsPacket( + const CalculatorContext& cc) { + const auto& options = cc.Options(); + if (!options.model_path().empty()) { + std::string model_path = options.model_path(); + + ASSIGN_OR_RETURN(model_path, mediapipe::PathToResourceAsFile(model_path)); + + auto model = tflite::FlatBufferModel::BuildFromFile(model_path.c_str()); + RET_CHECK(model) << "Failed to load model from path."; + return MakePacket(TfLiteModelPtr( + model.release(), [](tflite::FlatBufferModel* model) { delete model; })); + } + if (cc.InputSidePackets().HasTag("MODEL")) { + return cc.InputSidePackets().Tag("MODEL"); + } + return ::mediapipe::Status( + ::mediapipe::StatusCode::kNotFound, + "Must specify TFLite model as path or loaded model."); +} + +::mediapipe::Status InferenceCalculator::LoadDelegate(CalculatorContext* cc) { + const auto& calculator_opts = + cc->Options(); + if (calculator_opts.has_delegate() && + calculator_opts.delegate().has_tflite()) { + // Default tflite inference requeqsted - no need to modify graph. + return ::mediapipe::OkStatus(); + } + + if (!use_gpu_delegate_) { +#if defined(MEDIAPIPE_ANDROID) + const bool nnapi_requested = calculator_opts.has_delegate() + ? calculator_opts.delegate().has_nnapi() + : calculator_opts.use_nnapi(); + if (nnapi_requested) { + // Attempt to use NNAPI. + // If not supported, the default CPU delegate will be created and used. + interpreter_->SetAllowFp16PrecisionForFp32(1); + delegate_ = + TfLiteDelegatePtr(tflite::NnApiDelegate(), [](TfLiteDelegate*) { + // No need to free according to tflite::NnApiDelegate() + // documentation. 
+ }); + RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); + return ::mediapipe::OkStatus(); + } +#endif // MEDIAPIPE_ANDROID + +#if defined(__EMSCRIPTEN__) + const bool xnnpack_requested = true; +#else + const bool xnnpack_requested = calculator_opts.has_delegate() && + calculator_opts.delegate().has_xnnpack(); +#endif // __EMSCRIPTEN__ + +#if !defined(MEDIAPIPE_EDGE_TPU) + if (xnnpack_requested) { + TfLiteXNNPackDelegateOptions xnnpack_opts{}; + xnnpack_opts.num_threads = GetXnnpackNumThreads(calculator_opts); + delegate_ = TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&xnnpack_opts), + &TfLiteXNNPackDelegateDelete); + RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); + } +#endif // !EDGETPU + + // Return, no need for GPU delegate below. + return ::mediapipe::OkStatus(); + } else { +#if MEDIAPIPE_TFLITE_GL_INFERENCE + // Configure and create the delegate. + TfLiteGpuDelegateOptions options = TfLiteGpuDelegateOptionsDefault(); + options.compile_options.precision_loss_allowed = 1; + options.compile_options.preferred_gl_object_type = + TFLITE_GL_OBJECT_TYPE_FASTEST; + options.compile_options.dynamic_batch_enabled = 0; + options.compile_options.inline_parameters = 1; + delegate_ = TfLiteDelegatePtr(TfLiteGpuDelegateCreate(&options), + &TfLiteGpuDelegateDelete); + + // Get input image sizes. + const auto& input_indices = interpreter_->inputs(); + for (int i = 0; i < input_indices.size(); ++i) { + const TfLiteTensor* tensor = interpreter_->tensor(input_indices[i]); + gpu_buffers_in_.emplace_back(absl::make_unique( + Tensor::ElementType::kFloat32, + Tensor::Shape{std::vector{ + tensor->dims->data, tensor->dims->data + tensor->dims->size}})); + RET_CHECK_EQ( + TfLiteGpuDelegateBindBufferToTensor( + delegate_.get(), + gpu_buffers_in_.back()->GetOpenGlBufferWriteView().name(), + interpreter_->inputs()[i]), + kTfLiteOk); + } + interpreter_->SetAllowBufferHandleOutput(true); + // Get output image sizes. + const auto& output_indices = interpreter_->outputs(); + output_shapes_.resize(output_indices.size()); + // Create and bind output buffers. + for (int i = 0; i < output_shapes_.size(); ++i) { + const TfLiteTensor* tensor = interpreter_->tensor(output_indices[i]); + gpu_buffers_out_.emplace_back(absl::make_unique( + Tensor::ElementType::kFloat32, + Tensor::Shape{std::vector{ + tensor->dims->data, tensor->dims->data + tensor->dims->size}})); + RET_CHECK_EQ( + TfLiteGpuDelegateBindBufferToTensor( + delegate_.get(), + gpu_buffers_out_.back()->GetOpenGlBufferWriteView().name(), + output_indices[i]), + kTfLiteOk); + } + + // Must call this last. + RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +#elif MEDIAPIPE_TFLITE_METAL_INFERENCE + // Configure and create the delegate. + TFLGpuDelegateOptions options; + options.allow_precision_loss = true; + options.wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive; + delegate_ = TfLiteDelegatePtr(TFLGpuDelegateCreate(&options), + &TFLGpuDelegateDelete); + RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); + id device = gpu_helper_.mtlDevice; + + // Get input image sizes. + const auto& input_indices = interpreter_->inputs(); + for (int i = 0; i < input_indices.size(); ++i) { + const TfLiteTensor* tensor = interpreter_->tensor(input_indices[i]); + // Create and bind input buffer. 
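+      // The last dimension is padded to a multiple of 4 because the Metal
+      // delegate exchanges data in BPHWC4 layout; the padded float16 buffer is
+      // then registered with TFLGpuDelegateBindMetalBufferToTensor below.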
+ std::vector dims{tensor->dims->data, + tensor->dims->data + tensor->dims->size}; + dims.back() = RoundUp(dims.back(), 4); + gpu_buffers_in_.emplace_back(absl::make_unique( + Tensor::ElementType::kFloat16, Tensor::Shape{dims})); + auto buffer_view = + gpu_buffers_in_[i]->GetMtlBufferWriteView(gpu_helper_.mtlDevice); + RET_CHECK_EQ(TFLGpuDelegateBindMetalBufferToTensor( + delegate_.get(), input_indices[i], buffer_view.buffer()), + true); + } + + interpreter_->SetAllowBufferHandleOutput(true); + // Get output image sizes. + const auto& output_indices = interpreter_->outputs(); + output_shapes_.resize(output_indices.size()); + for (int i = 0; i < output_shapes_.size(); ++i) { + const TfLiteTensor* tensor = interpreter_->tensor(output_indices[i]); + RET_CHECK(tensor->dims->size <= 4); + // Create and bind output buffers. + // Channels are always padded to multiple of 4. + std::vector dims{tensor->dims->data, + tensor->dims->data + tensor->dims->size}; + output_shapes_[i] = {dims}; + dims.back() = RoundUp(dims.back(), 4); + gpu_buffers_out_.emplace_back(absl::make_unique( + Tensor::ElementType::kFloat16, Tensor::Shape{dims})); + RET_CHECK_EQ(TFLGpuDelegateBindMetalBufferToTensor( + delegate_.get(), output_indices[i], + gpu_buffers_out_[i] + ->GetMtlBufferWriteView(gpu_helper_.mtlDevice) + .buffer()), + true); + } + + // Create converter for GPU input. + converter_to_BPHWC4_ = [[TFLBufferConvert alloc] initWithDevice:device + isFloat16:true + convertToPBHWC4:true]; + if (converter_to_BPHWC4_ == nil) { + return mediapipe::InternalError( + "Error initializating input buffer converter"); + } + // Create converter for GPU output. + converter_from_BPHWC4_ = [[TFLBufferConvert alloc] initWithDevice:device + isFloat16:true + convertToPBHWC4:false]; + if (converter_from_BPHWC4_ == nil) { + return mediapipe::InternalError( + "Error initializating output buffer converter"); + } +#endif // MEDIAPIPE_TFLITE_GL_INFERENCE + } + + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/inference_calculator.proto b/mediapipe/calculators/tensor/inference_calculator.proto new file mode 100644 index 0000000000..07201f9d51 --- /dev/null +++ b/mediapipe/calculators/tensor/inference_calculator.proto @@ -0,0 +1,111 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +// Full Example: +// +// node { +// calculator: "InferenceCalculator" +// input_stream: "TENSOR_IN:image_tensors" +// output_stream: "TENSOR_OUT:result_tensors" +// options { +// [mediapipe.InferenceCalculatorOptions.ext] { +// model_path: "model.tflite" +// delegate { gpu {} } +// } +// } +// } +// +message InferenceCalculatorOptions { + extend mediapipe.CalculatorOptions { + optional InferenceCalculatorOptions ext = 336783863; + } + + message Delegate { + // Default inference provided by tflite. 
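+    // For example, delegate { tflite {} } selects the plain CPU interpreter
+    // with no delegate applied.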
+ message TfLite {} + // Delegate to run GPU inference depending on the device. + // (Can use OpenGl, OpenCl, Metal depending on the device.) + message Gpu { + // Experimental, Android/Linux only. Use TFLite GPU delegate API2 for + // the NN inference. + // example: + // delegate: { gpu { use_advanced_gpu_api: true } } + optional bool use_advanced_gpu_api = 1 [default = false]; + + // This option is valid for TFLite GPU delegate API2 only, + // Choose any of available APIs to force running inference using it. + enum API { + ANY = 0; + OPENGL = 1; + OPENCL = 2; + } + optional API api = 4 [default = ANY]; + + // This option is valid for TFLite GPU delegate API2 only, + // Set to true to use 16-bit float precision. If max precision is needed, + // set to false for 32-bit float calculations only. + optional bool allow_precision_loss = 3 [default = true]; + + // Load pre-compiled serialized binary cache to accelerate init process. + // Only available for OpenCL delegate on Android. + optional bool use_kernel_caching = 2 [default = false]; + } + // Android only. + message Nnapi {} + message Xnnpack { + // Number of threads for XNNPACK delegate. (By default, calculator tries + // to choose optimal number of threads depending on the device.) + optional int32 num_threads = 1 [default = -1]; + } + + oneof delegate { + TfLite tflite = 1; + Gpu gpu = 2; + Nnapi nnapi = 3; + Xnnpack xnnpack = 4; + } + } + + // Path to the TF Lite model (ex: /path/to/modelname.tflite). + // On mobile, this is generally just modelname.tflite. + optional string model_path = 1; + + // Whether the TF Lite GPU or CPU backend should be used. Effective only when + // input tensors are on CPU. For input tensors on GPU, GPU backend is always + // used. + // DEPRECATED: configure "delegate" instead. + optional bool use_gpu = 2 [deprecated = true, default = false]; + + // Android only. When true, an NNAPI delegate will be used for inference. + // If NNAPI is not available, then the default CPU delegate will be used + // automatically. + // DEPRECATED: configure "delegate" instead. + optional bool use_nnapi = 3 [deprecated = true, default = false]; + + // The number of threads available to the interpreter. Effective only when + // input tensors are on CPU and 'use_gpu' is false. + optional int32 cpu_num_thread = 4 [default = -1]; + + // TfLite delegate to run inference. + // NOTE: calculator is free to choose delegate if not specified explicitly. + // NOTE: use_gpu/use_nnapi are ignored if specified. (Delegate takes + // precedence over use_* deprecated options.) + optional Delegate delegate = 5; +} diff --git a/mediapipe/calculators/tensor/inference_calculator_test.cc b/mediapipe/calculators/tensor/inference_calculator_test.cc new file mode 100644 index 0000000000..248d799e52 --- /dev/null +++ b/mediapipe/calculators/tensor/inference_calculator_test.cc @@ -0,0 +1,162 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "absl/strings/str_replace.h" +#include "absl/strings/string_view.h" +#include "mediapipe/calculators/tensor/inference_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/calculator_runner.h" +#include "mediapipe/framework/deps/file_path.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/gmock.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/parse_text_proto.h" +#include "mediapipe/framework/port/status_matchers.h" // NOLINT +#include "mediapipe/framework/tool/validate_type.h" +#include "tensorflow/lite/error_reporter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" + +#ifdef __APPLE__ +#include +#endif // defined(__APPLE__) + +namespace mediapipe { + +using ::tflite::Interpreter; + +void DoSmokeTest(const std::string& graph_proto) { + const int width = 8; + const int height = 8; + const int channels = 3; + // Prepare input tensor. + auto input_vec = absl::make_unique>(); + input_vec->emplace_back(Tensor::ElementType::kFloat32, + Tensor::Shape{1, height, width, channels}); + { + auto view1 = input_vec->back().GetCpuWriteView(); + auto tensor_buffer = view1.buffer(); + ASSERT_NE(tensor_buffer, nullptr); + for (int i = 0; i < width * height * channels - 1; i++) { + tensor_buffer[i] = 1; + } + } + + // Prepare single calculator graph to and wait for packets. + CalculatorGraphConfig graph_config = + ParseTextProtoOrDie(graph_proto); + std::vector output_packets; + tool::AddVectorSink("tensor_out", &graph_config, &output_packets); + CalculatorGraph graph(graph_config); + MP_ASSERT_OK(graph.StartRun({})); + + // Push the tensor into the graph. + MP_ASSERT_OK(graph.AddPacketToInputStream( + "tensor_in", Adopt(input_vec.release()).At(Timestamp(0)))); + // Wait until the calculator done processing. + MP_ASSERT_OK(graph.WaitUntilIdle()); + ASSERT_EQ(1, output_packets.size()); + + // Get and process results. + const std::vector& result_vec = + output_packets[0].Get>(); + ASSERT_EQ(1, result_vec.size()); + + const Tensor& result = result_vec[0]; + auto view = result.GetCpuReadView(); + auto result_buffer = view.buffer(); + ASSERT_NE(result_buffer, nullptr); + for (int i = 0; i < width * height * channels - 1; i++) { + ASSERT_EQ(3, result_buffer[i]); + } + + // Fully close graph at end, otherwise calculator+tensors are destroyed + // after calling WaitUntilDone(). + MP_ASSERT_OK(graph.CloseInputStream("tensor_in")); + MP_ASSERT_OK(graph.WaitUntilDone()); +} + +// Tests a simple add model that adds an input tensor to itself. +TEST(InferenceCalculatorTest, SmokeTest) { + std::string graph_proto = R"( + input_stream: "tensor_in" + node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:tensor_in" + output_stream: "TENSORS:tensor_out" + options { + [mediapipe.InferenceCalculatorOptions.ext] { + model_path: "mediapipe/calculators/tensor/testdata/add.bin" + $delegate + } + } + } + )"; + // Test CPU inference only. 
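+  // "$delegate" in the graph template above is textually replaced with each
+  // CPU delegate configuration in turn (tflite, xnnpack, and xnnpack with an
+  // explicit thread count).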
+ DoSmokeTest(/*graph_proto=*/absl::StrReplaceAll( + graph_proto, {{"$delegate", "delegate { tflite {} }"}})); + DoSmokeTest(/*graph_proto=*/absl::StrReplaceAll( + graph_proto, {{"$delegate", "delegate { xnnpack {} }"}})); + DoSmokeTest(/*graph_proto=*/absl::StrReplaceAll( + graph_proto, + {{"$delegate", "delegate { xnnpack { num_threads: 10 } }"}})); +} + +TEST(InferenceCalculatorTest, SmokeTest_ModelAsInputSidePacket) { + std::string graph_proto = R"( + input_stream: "tensor_in" + + node { + calculator: "ConstantSidePacketCalculator" + output_side_packet: "PACKET:model_path" + options: { + [mediapipe.ConstantSidePacketCalculatorOptions.ext]: { + packet { string_value: "mediapipe/calculators/tensor/testdata/add.bin" } + } + } + } + + node { + calculator: "LocalFileContentsCalculator" + input_side_packet: "FILE_PATH:model_path" + output_side_packet: "CONTENTS:model_blob" + } + + node { + calculator: "TfLiteModelCalculator" + input_side_packet: "MODEL_BLOB:model_blob" + output_side_packet: "MODEL:model" + } + + node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:tensor_in" + output_stream: "TENSORS:tensor_out" + input_side_packet: "MODEL:model" + options { + [mediapipe.InferenceCalculatorOptions.ext] { + delegate { tflite {} } + } + } + } + )"; + DoSmokeTest(graph_proto); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/tensor_converter_calculator.cc b/mediapipe/calculators/tensor/tensor_converter_calculator.cc new file mode 100644 index 0000000000..9ac246d5a8 --- /dev/null +++ b/mediapipe/calculators/tensor/tensor_converter_calculator.cc @@ -0,0 +1,676 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "mediapipe/calculators/tensor/tensor_converter_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/formats/matrix.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/util/resource_util.h" + +#if !MEDIAPIPE_DISABLE_GPU +#include "mediapipe/gpu/gpu_buffer.h" +#if MEDIAPIPE_METAL_ENABLED +#import +#import +#import + +#import "mediapipe/gpu/MPPMetalHelper.h" +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +#include "mediapipe/gpu/gl_calculator_helper.h" +#if MEDIAPIPE_OPENGL_ES_VERSION < MEDIAPIPE_OPENGL_ES_31 +#include "mediapipe/gpu/gl_simple_shaders.h" +#include "mediapipe/gpu/shader_util.h" +#endif // MEDIAPIPE_OPENGL_ES_VERSION < MEDIAPIPE_OPENGL_ES_31 +#endif // MEDIAPIPE_METAL_ENABLED +#endif // !MEDIAPIPE_DISABLE_GPU + +namespace { +constexpr int kWorkgroupSize = 8; // Block size for GPU shader. +// Commonly used to compute the number of blocks to launch in a kernel. 
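+// For example, with kWorkgroupSize = 8 a 640x480 input dispatches
+// NumGroups(640, 8) x NumGroups(480, 8) = 80 x 60 workgroups (illustrative
+// numbers only).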
+int NumGroups(const int size, const int group_size) { // NOLINT + return (size + group_size - 1) / group_size; +} + +typedef Eigen::Matrix + RowMajorMatrixXf; +typedef Eigen::Matrix + ColMajorMatrixXf; + +constexpr char kImageFrameTag[] = "IMAGE"; +constexpr char kGpuBufferTag[] = "IMAGE_GPU"; +constexpr char kTensorsTag[] = "TENSORS"; +constexpr char kMatrixTag[] = "MATRIX"; +} // namespace + +namespace mediapipe { + +// Calculator for normalizing and converting an ImageFrame, GpuBuffer or Matrix +// into a Tensor. +// +// This calculator is designed to be used with the TfLiteInferenceCalcualtor, +// as a pre-processing step for calculator inputs. +// +// IMAGE and IMAGE_GPU inputs are normalized to [-1,1] (default) or [0,1], +// specified by options (unless outputting a quantized tensor). +// +// Input: +// One of the following tags: +// IMAGE - ImageFrame (assumed to be 8-bit or 32-bit data). +// IMAGE_GPU - GpuBuffer (assumed to be RGBA or RGB GL texture). +// MATRIX - Matrix. +// +// Output: +// One of the following tags: +// TENSORS - Vector of Tensors of type kFloat32. The resource type used: +// - MTLBuffer if Metal API is available +// - SSBO if Metal is unavailable and OpenGL ES 3.1 is available +// - Texture2D if Metal and GLES 3.1 are not available and GLES 3.0 is. +// +// Example use: +// node { +// calculator: "TensorConverterCalculator" +// input_stream: "IMAGE:input_image" +// output_stream: "TENSORS:image_tensor" +// options: { +// [mediapipe.TensorConverterCalculatorOptions.ext] { +// zero_center: true +// } +// } +// } +// +// IMPORTANT Notes: +// GPU tensors are currently only supported on mobile platforms. + +class TensorConverterCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + private: + ::mediapipe::Status InitGpu(CalculatorContext* cc); + ::mediapipe::Status LoadOptions(CalculatorContext* cc); + template + ::mediapipe::Status NormalizeImage(const ImageFrame& image_frame, + bool flip_vertically, float* tensor_ptr); + ::mediapipe::Status CopyMatrixToTensor(const Matrix& matrix, + float* tensor_ptr); + ::mediapipe::Status ProcessCPU(CalculatorContext* cc); + ::mediapipe::Status ProcessGPU(CalculatorContext* cc); + +#if MEDIAPIPE_METAL_ENABLED + MPPMetalHelper* gpu_helper_ = nullptr; + id to_buffer_program_; +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + mediapipe::GlCalculatorHelper gpu_helper_; +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + GLuint to_buffer_program_; +#else + enum { ATTRIB_VERTEX, ATTRIB_TEXTURE_POSITION, NUM_ATTRIBUTES }; + GLuint to_tex2d_program_; + GLuint framebuffer_; +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 +#endif // MEDIAPIPE_METAL_ENABLED + + bool initialized_ = false; + bool use_gpu_ = false; + absl::optional> output_range_; + bool flip_vertically_ = false; + bool row_major_matrix_ = false; + int max_num_channels_ = 3; +}; +REGISTER_CALCULATOR(TensorConverterCalculator); + +::mediapipe::Status TensorConverterCalculator::GetContract( + CalculatorContract* cc) { + // Confirm only one of the input streams is present. 
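+  // Exactly one of IMAGE, IMAGE_GPU or MATRIX may be connected; the casts
+  // below count the tags that are present and require that count to be 1.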
+ RET_CHECK(static_cast(cc->Inputs().HasTag(kImageFrameTag)) + + static_cast(cc->Inputs().HasTag(kGpuBufferTag)) + + static_cast(cc->Inputs().HasTag(kMatrixTag)) == + 1); + + if (cc->Inputs().HasTag(kImageFrameTag)) { + cc->Inputs().Tag(kImageFrameTag).Set(); + } + if (cc->Inputs().HasTag(kMatrixTag)) { + cc->Inputs().Tag(kMatrixTag).Set(); + } + +#if !MEDIAPIPE_DISABLE_GPU + if (cc->Inputs().HasTag(kGpuBufferTag)) { + cc->Inputs().Tag(kGpuBufferTag).Set(); +#if MEDIAPIPE_METAL_ENABLED + MP_RETURN_IF_ERROR([MPPMetalHelper updateContract:cc]); +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + MP_RETURN_IF_ERROR(mediapipe::GlCalculatorHelper::UpdateContract(cc)); +#endif // MEDIAPIPE_METAL_ENABLED + } +#endif // !MEDIAPIPE_DISABLE_GPU + + RET_CHECK(cc->Outputs().HasTag(kTensorsTag)); + cc->Outputs().Tag(kTensorsTag).Set>(); + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::Open(CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + + MP_RETURN_IF_ERROR(LoadOptions(cc)); + +#if !MEDIAPIPE_DISABLE_GPU + if (cc->Inputs().HasTag(kGpuBufferTag)) { + use_gpu_ = true; +#if MEDIAPIPE_METAL_ENABLED + gpu_helper_ = [[MPPMetalHelper alloc] initWithCalculatorContext:cc]; + RET_CHECK(gpu_helper_); +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + MP_RETURN_IF_ERROR(gpu_helper_.Open(cc)); +#endif // MEDIAPIPE_METAL_ENABLED + } +#endif // !MEDIAPIPE_DISABLE_GPU + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::Process(CalculatorContext* cc) { + if (use_gpu_) { + if (cc->Inputs().Tag(kGpuBufferTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + // Convert to GPU tensors type. + MP_RETURN_IF_ERROR(ProcessGPU(cc)); + } else { + // Convert to CPU tensors or Matrix type. 
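+    // Matrix inputs are copied as-is (normalization options do not apply);
+    // ImageFrame inputs are normalized according to the calculator options.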
+ MP_RETURN_IF_ERROR(ProcessCPU(cc)); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::Close(CalculatorContext* cc) { +#if !MEDIAPIPE_DISABLE_GPU + if (use_gpu_) { +#if MEDIAPIPE_METAL_ENABLED + to_buffer_program_ = nil; +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + gpu_helper_.RunInGlContext([this] { +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + glDeleteProgram(to_buffer_program_); +#else + glDeleteFramebuffers(1, &framebuffer_); + glDeleteProgram(to_tex2d_program_); +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + }); +#endif // MEDIAPIPE_METAL_ENABLED + } +#endif // !MEDIAPIPE_DISABLE_GPU + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::ProcessCPU( + CalculatorContext* cc) { + auto output_tensors = absl::make_unique>(); + if (cc->Inputs().HasTag(kImageFrameTag)) { + if (cc->Inputs().Tag(kImageFrameTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + const auto& image_frame = + cc->Inputs().Tag(kImageFrameTag).Get(); + const int height = image_frame.Height(); + const int width = image_frame.Width(); + const int channels = image_frame.NumberOfChannels(); + const int channels_preserved = std::min(channels, max_num_channels_); + const mediapipe::ImageFormat::Format format = image_frame.Format(); + + if (!(format == mediapipe::ImageFormat::SRGBA || + format == mediapipe::ImageFormat::SRGB || + format == mediapipe::ImageFormat::GRAY8 || + format == mediapipe::ImageFormat::VEC32F1)) + RET_CHECK_FAIL() << "Unsupported CPU input format."; + + output_tensors->emplace_back( + Tensor::ElementType::kFloat32, + Tensor::Shape{1, height, width, channels_preserved}); + auto cpu_view = output_tensors->back().GetCpuWriteView(); + + // Copy image data into tensor. 
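+    // ByteDepth 1 covers the 8-bit formats (SRGB/SRGBA/GRAY8); ByteDepth 4
+    // covers 32-bit float input (VEC32F1).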
+ if (image_frame.ByteDepth() == 1) { + MP_RETURN_IF_ERROR(NormalizeImage(image_frame, flip_vertically_, + cpu_view.buffer())); + } else if (image_frame.ByteDepth() == 4) { + MP_RETURN_IF_ERROR(NormalizeImage(image_frame, flip_vertically_, + cpu_view.buffer())); + } else { + return ::mediapipe::InternalError( + "Only byte-based (8 bit) and float (32 bit) images supported."); + } + } else if (cc->Inputs().HasTag(kMatrixTag)) { + if (cc->Inputs().Tag(kMatrixTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + const auto& matrix = cc->Inputs().Tag(kMatrixTag).Get(); + const int height = matrix.rows(); + const int width = matrix.cols(); + const int channels = 1; + output_tensors->emplace_back(Tensor::ElementType::kFloat32, + Tensor::Shape{1, height, width, channels}); + MP_RETURN_IF_ERROR(CopyMatrixToTensor( + matrix, output_tensors->back().GetCpuWriteView().buffer())); + } else { + return ::mediapipe::OkStatus(); + } + cc->Outputs() + .Tag(kTensorsTag) + .Add(output_tensors.release(), cc->InputTimestamp()); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::ProcessGPU( + CalculatorContext* cc) { +#if !MEDIAPIPE_DISABLE_GPU + if (!initialized_) { + MP_RETURN_IF_ERROR(InitGpu(cc)); + initialized_ = true; + } + const auto& input = + cc->Inputs().Tag(kGpuBufferTag).Get(); + int width = input.width(); + int height = input.height(); + int channels = max_num_channels_; + auto output_tensors = absl::make_unique>(); + output_tensors->emplace_back(Tensor::ElementType::kFloat32, + Tensor::Shape{1, height, width, channels}); +#if MEDIAPIPE_METAL_ENABLED + id device = gpu_helper_.mtlDevice; + id command_buffer = [gpu_helper_ commandBuffer]; + command_buffer.label = @"TensorConverterCalculatorConvert"; + id compute_encoder = + [command_buffer computeCommandEncoder]; + [compute_encoder setComputePipelineState:to_buffer_program_]; + id src_texture = [gpu_helper_ metalTextureWithGpuBuffer:input]; + [compute_encoder setTexture:src_texture atIndex:0]; + auto output_view = + output_tensors->at(0).GetMtlBufferWriteView(command_buffer); + [compute_encoder setBuffer:output_view.buffer() offset:0 atIndex:1]; + MTLSize threads_per_group = MTLSizeMake(kWorkgroupSize, kWorkgroupSize, 1); + MTLSize threadgroups = + MTLSizeMake(NumGroups(input.width(), kWorkgroupSize), + NumGroups(input.height(), kWorkgroupSize), 1); + [compute_encoder dispatchThreadgroups:threadgroups + threadsPerThreadgroup:threads_per_group]; + [compute_encoder endEncoding]; + [command_buffer commit]; +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext( + [this, &output_tensors, &input]() -> ::mediapipe::Status { + auto src = gpu_helper_.CreateSourceTexture(input); +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + // Convert GL texture into SSBO. + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, src.name()); + auto output_view = output_tensors->back().GetOpenGlBufferWriteView(); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, output_view.name()); + glUseProgram(to_buffer_program_); + glDispatchCompute(NumGroups(input.width(), kWorkgroupSize), + NumGroups(input.height(), kWorkgroupSize), 1); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); + glBindTexture(GL_TEXTURE_2D, 0); +#else + // Texture2D -> Texture2D with OpenGL ES 3.0. 
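+        // Renders a full-screen quad with the conversion fragment shader into
+        // the tensor's 2D texture, which is attached to framebuffer_ below.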
+ glUseProgram(to_tex2d_program_); + glDisable(GL_DEPTH_TEST); + glBindFramebuffer(GL_FRAMEBUFFER, framebuffer_); + glViewport(0, 0, src.width(), src.height()); + glActiveTexture(GL_TEXTURE0); + auto output_view = output_tensors->back().GetOpenGlTexture2dWriteView(); + glBindTexture(GL_TEXTURE_2D, output_view.name()); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, output_view.name(), 0); + glActiveTexture(GL_TEXTURE1); + glBindTexture(src.target(), src.name()); + glVertexAttribPointer(ATTRIB_VERTEX, 2, GL_FLOAT, 0, 0, + mediapipe::kBasicSquareVertices); + glEnableVertexAttribArray(ATTRIB_VERTEX); + glVertexAttribPointer(ATTRIB_TEXTURE_POSITION, 2, GL_FLOAT, 0, 0, + mediapipe::kBasicTextureVertices); + glEnableVertexAttribArray(ATTRIB_TEXTURE_POSITION); + + // draw + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + + // cleanup + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, 0); + glActiveTexture(GL_TEXTURE1); + glBindTexture(GL_TEXTURE_2D, 0); +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + src.Release(); + return ::mediapipe::OkStatus(); + })); +#endif // MEDIAPIPE_METAL_ENABLED + cc->Outputs() + .Tag(kTensorsTag) + .Add(output_tensors.release(), cc->InputTimestamp()); +#else + RET_CHECK_FAIL() << "GPU processing is not enabled."; +#endif // !MEDIAPIPE_DISABLE_GPU + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::InitGpu(CalculatorContext* cc) { +#if !MEDIAPIPE_DISABLE_GPU + // Get input image sizes. + const auto& input = + cc->Inputs().Tag(kGpuBufferTag).Get(); + mediapipe::ImageFormat::Format format = + mediapipe::ImageFormatForGpuBufferFormat(input.format()); + const bool include_alpha = (max_num_channels_ == 4); + const bool single_channel = (max_num_channels_ == 1); + if (!(format == mediapipe::ImageFormat::GRAY8 || + format == mediapipe::ImageFormat::SRGB || + format == mediapipe::ImageFormat::SRGBA)) + RET_CHECK_FAIL() << "Unsupported GPU input format."; + if (include_alpha && (format != mediapipe::ImageFormat::SRGBA)) + RET_CHECK_FAIL() << "Num input channels is less than desired output."; + +#if MEDIAPIPE_METAL_ENABLED + id device = gpu_helper_.mtlDevice; + // Shader to convert GL Texture to Metal Buffer, + // with normalization to either: [0,1] or [-1,1]. + const std::string shader_source = absl::Substitute( + R"( + #include + + using namespace metal; + + kernel void convertKernel( + texture2d in_tex [[ texture(0) ]], + device float* out_buf [[ buffer(1) ]], + uint2 gid [[ thread_position_in_grid ]]) { + if (gid.x >= in_tex.get_width() || gid.y >= in_tex.get_height()) return; + constexpr sampler texture_sampler(coord::pixel, address::clamp_to_edge); + const float2 coord = float2(gid.x, gid.y); + half4 pixel = in_tex.sample(texture_sampler, coord); + $0 // normalize [-1,1] + const int linear_index = $1 * ($2 * in_tex.get_width() + gid.x); + out_buf[linear_index + 0] = pixel.x; + $3 // g & b channels + $4 // alpha channel + } + )", + /*$0=*/ + output_range_.has_value() + ? absl::Substitute("pixel = pixel * half($0) + half($1);", + (output_range_->second - output_range_->first), + output_range_->first) + : "", + /*$1=*/max_num_channels_, + /*$2=*/flip_vertically_ ? "(in_tex.get_height() - 1 - gid.y)" : "gid.y", + /*$3=*/ + single_channel ? "" : R"(out_buf[linear_index + 1] = pixel.y; + out_buf[linear_index + 2] = pixel.z;)", + /*$4=*/include_alpha ? 
"out_buf[linear_index + 3] = pixel.w;" : ""); + + NSString* library_source = + [NSString stringWithUTF8String:shader_source.c_str()]; + NSError* error = nil; + id library = + [device newLibraryWithSource:library_source options:nullptr error:&error]; + RET_CHECK(library != nil) << "Couldn't create shader library " + << [[error localizedDescription] UTF8String]; + id kernel_func = nil; + kernel_func = [library newFunctionWithName:@"convertKernel"]; + RET_CHECK(kernel_func != nil) << "Couldn't create kernel function."; + to_buffer_program_ = + [device newComputePipelineStateWithFunction:kernel_func error:&error]; + RET_CHECK(to_buffer_program_ != nil) << "Couldn't create pipeline state " << + [[error localizedDescription] UTF8String]; +#elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext([this, &include_alpha, +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + &input, +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + &single_channel]() + -> ::mediapipe::Status { +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + // Shader to convert GL Texture to Shader Storage Buffer Object (SSBO), + // with normalization to either: [0,1] or [-1,1]. + const std::string shader_source = absl::Substitute( + R"( #version 310 es + layout(local_size_x = $0, local_size_y = $0) in; + layout(binding = 0) uniform sampler2D input_texture; + layout(std430, binding = 1) buffer Output {float elements[];} output_data; + ivec2 width_height = ivec2($1, $2); + void main() { + ivec2 gid = ivec2(gl_GlobalInvocationID.xy); + if (gid.x >= width_height.x || gid.y >= width_height.y) return; + vec4 pixel = texelFetch(input_texture, gid, 0); + $3 // normalize [-1,1] + int linear_index = $7 * ($4 * width_height.x + gid.x); + output_data.elements[linear_index + 0] = pixel.x; // r channel + $5 // g & b channels + $6 // alpha channel + })", + /*$0=*/kWorkgroupSize, /*$1=*/input.width(), /*$2=*/input.height(), + /*$3=*/ + output_range_.has_value() + ? absl::Substitute("pixel = pixel * float($0) + float($1);", + (output_range_->second - output_range_->first), + output_range_->first) + : "", + /*$4=*/flip_vertically_ ? "(width_height.y - 1 - gid.y)" : "gid.y", + /*$5=*/ + single_channel ? "" + : R"(output_data.elements[linear_index + 1] = pixel.y; + output_data.elements[linear_index + 2] = pixel.z;)", + /*$6=*/ + include_alpha ? "output_data.elements[linear_index + 3] = pixel.w;" + : "", + /*$7=*/max_num_channels_); + GLuint shader = glCreateShader(GL_COMPUTE_SHADER); + const GLchar* sources[] = {shader_source.c_str()}; + glShaderSource(shader, 1, sources, NULL); + glCompileShader(shader); + GLint compiled = GL_FALSE; + glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled); + RET_CHECK(compiled == GL_TRUE); + to_buffer_program_ = glCreateProgram(); + glAttachShader(to_buffer_program_, shader); + glDeleteShader(shader); + glLinkProgram(to_buffer_program_); +#else + // OpenGL ES 3.0 fragment shader Texture2d -> Texture2d conversion. 
+ const std::string shader_source = absl::Substitute( + R"( + #if __VERSION__ < 130 + #define in varying + #endif // __VERSION__ < 130 + + #ifdef GL_ES + #define fragColor gl_FragColor + precision highp float; + #else + #define lowp + #define mediump + #define highp + #define texture2D texture + out $0 fragColor; + #endif // defined(GL_ES) + + in vec2 sample_coordinate; + uniform sampler2D frame; + + void main() { + $1 // flip + vec4 pixel = texture2D(frame, sample_coordinate); + $2 // normalize [-1,1] + fragColor.r = pixel.r; // r channel + $3 // g & b channels + $4 // alpha channel + })", + /*$0=*/single_channel ? "vec1" : "vec4", + /*$1=*/ + flip_vertically_ ? "sample_coordinate.y = 1.0 - sample_coordinate.y;" + : "", + /*$2=*/output_range_.has_value() + ? absl::Substitute("pixel = pixel * float($0) + float($1);", + (output_range_->second - output_range_->first), + output_range_->first) + : "", + /*$3=*/single_channel ? "" : R"(fragColor.g = pixel.g; + fragColor.b = pixel.b;)", + /*$4=*/ + include_alpha ? "fragColor.a = pixel.a;" + : (single_channel ? "" : "fragColor.a = 1.0;")); + + const GLint attr_location[NUM_ATTRIBUTES] = { + ATTRIB_VERTEX, + ATTRIB_TEXTURE_POSITION, + }; + const GLchar* attr_name[NUM_ATTRIBUTES] = { + "position", + "texture_coordinate", + }; + // shader program and params + mediapipe::GlhCreateProgram( + mediapipe::kBasicVertexShader, shader_source.c_str(), NUM_ATTRIBUTES, + &attr_name[0], attr_location, &to_tex2d_program_); + RET_CHECK(to_tex2d_program_) << "Problem initializing the program."; + glUseProgram(to_tex2d_program_); + glUniform1i(glGetUniformLocation(to_tex2d_program_, "frame"), 1); + glGenFramebuffers(1, &framebuffer_); + +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + return ::mediapipe::OkStatus(); + })); +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +#endif // !MEDIAPIPE_DISABLE_GPU + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::LoadOptions( + CalculatorContext* cc) { + // Get calculator options specified in the graph. + const auto& options = + cc->Options<::mediapipe::TensorConverterCalculatorOptions>(); + + // if zero_center, set output float range to match [-1, 1] as specified in + // calculator proto. + if (options.zero_center()) { + output_range_.emplace(std::pair(-1.0, 1.0)); + } + + // Custom output_tensor_float_range values. + // If the float range is specified in pb text, use the specified values + // instead. + if (options.has_output_tensor_float_range()) { + output_range_.emplace(options.output_tensor_float_range().min(), + options.output_tensor_float_range().max()); + CHECK_GT(output_range_->second, output_range_->first); + } + + // Custom div and sub values. + if (options.use_custom_normalization()) { + output_range_.emplace(std::pair( + -options.custom_sub(), + -options.custom_sub() + 255.0 / options.custom_div())); + } + + // Get y-flip mode. + flip_vertically_ = options.flip_vertically(); + + // Get row_major_matrix mode. + row_major_matrix_ = options.row_major_matrix(); + + // Get desired way to handle input channels. 
+ max_num_channels_ = options.max_num_channels(); + CHECK_GE(max_num_channels_, 1); + CHECK_LE(max_num_channels_, 4); + CHECK_NE(max_num_channels_, 2); + return ::mediapipe::OkStatus(); +} + +template +::mediapipe::Status TensorConverterCalculator::NormalizeImage( + const ImageFrame& image_frame, bool flip_vertically, float* tensor_ptr) { + const int height = image_frame.Height(); + const int width = image_frame.Width(); + const int channels = image_frame.NumberOfChannels(); + const int channels_preserved = std::min(channels, max_num_channels_); + const int channels_ignored = channels - channels_preserved; + + if (output_range_.has_value()) { + // If the output float range is set and we are not using custom + // normalization, normalize the pixel values from [0, 255] to the specified + // output range. + RET_CHECK_NE(output_range_->first, output_range_->second); + const float scale = (output_range_->second - output_range_->first) / 255.0f; + const float bias = output_range_->first; + + for (int i = 0; i < height; ++i) { + const T* image_ptr = reinterpret_cast( + image_frame.PixelData() + + (flip_vertically ? height - 1 - i : i) * image_frame.WidthStep()); + for (int j = 0; j < width; ++j) { + for (int c = 0; c < channels_preserved; ++c) { + *tensor_ptr++ = *image_ptr++ * scale + bias; + } + image_ptr += channels_ignored; + } + } + } else { + // [0,1], scale only (bias == 0) + // Verified that there are no precision issues with 1.0f / 255.0f expression + const float scale = 1.0f / 255.0f; + for (int i = 0; i < height; ++i) { + const T* image_ptr = reinterpret_cast( + image_frame.PixelData() + + (flip_vertically ? height - 1 - i : i) * image_frame.WidthStep()); + for (int j = 0; j < width; ++j) { + for (int c = 0; c < channels_preserved; ++c) { + *tensor_ptr++ = *image_ptr++ * scale; + } + image_ptr += channels_ignored; + } + } + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorConverterCalculator::CopyMatrixToTensor( + const Matrix& matrix, float* tensor_ptr) { + if (row_major_matrix_) { + auto matrix_map = + Eigen::Map(tensor_ptr, matrix.rows(), matrix.cols()); + matrix_map = matrix; + } else { + auto matrix_map = + Eigen::Map(tensor_ptr, matrix.rows(), matrix.cols()); + matrix_map = matrix; + } + + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/tensor_converter_calculator.proto b/mediapipe/calculators/tensor/tensor_converter_calculator.proto new file mode 100644 index 0000000000..97c2154a04 --- /dev/null +++ b/mediapipe/calculators/tensor/tensor_converter_calculator.proto @@ -0,0 +1,69 @@ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +// Full Example: +// +// node { +// calculator: "TensorConverterCalculator" +// input_stream: "IMAGE_IN:input_image" +// output_stream: "TENSOR_OUT:image_tensor" +// options { +// [mediapipe.TensorConverterCalculatorOptions.ext] { +// zero_center: true +// } +// } +// } +// +message TensorConverterCalculatorOptions { + extend mediapipe.CalculatorOptions { + optional TensorConverterCalculatorOptions ext = 335742637; + } + + // Choose normalization mode for output (not applied for Matrix inputs). + // true = [-1,1] + // false = [0,1] + // Ignored if using quantization. + optional bool zero_center = 1 [default = true]; + + // Custom settings to override the internal scaling factors `div` and `sub`. + // Both values must be set to non-negative values. Will only take effect on + // CPU AND when |use_custom_normalization| is set to true. 
When these custom + // values take effect, the |zero_center| setting above will be overriden, and + // the normalized_value will be calculated as: + // normalized_value = input / custom_div - custom_sub. + optional bool use_custom_normalization = 6 [default = false]; + optional float custom_div = 7 [default = -1.0]; + optional float custom_sub = 8 [default = -1.0]; + + // Whether the input image should be flipped vertically (along the + // y-direction). This is useful, for example, when the input image is defined + // with a coordinate system where the origin is at the bottom-left corner + // (e.g., in OpenGL) whereas the ML model expects an image with a top-left + // origin. + optional bool flip_vertically = 2 [default = false]; + + // Controls how many channels of the input image get passed through to the + // tensor. Valid values are 1,3,4 only. Ignored for iOS GPU. + optional int32 max_num_channels = 3 [default = 3]; + + // The calculator expects Matrix inputs to be in column-major order. Set + // row_major_matrix to true if the inputs are in row-major order. + optional bool row_major_matrix = 4 [default = false]; + + // Quantization option (CPU only). + // When true, output kUint8 tensor instead of kFloat32. + optional bool use_quantized_tensors = 5 [default = false]; + + // Normalization option. + // Setting normalization_range results in the values normalized to + // the range [output_tensor_float_range.min, output_tensor_float_range.max]. + optional TensorFloatRange output_tensor_float_range = 9; + + message TensorFloatRange { + optional float min = 1; + optional float max = 2; + } +} diff --git a/mediapipe/calculators/tensor/tensor_converter_calculator_test.cc b/mediapipe/calculators/tensor/tensor_converter_calculator_test.cc new file mode 100644 index 0000000000..eccd8c73f2 --- /dev/null +++ b/mediapipe/calculators/tensor/tensor_converter_calculator_test.cc @@ -0,0 +1,323 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/substitute.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/calculator_runner.h" +#include "mediapipe/framework/formats/image_format.pb.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/formats/image_frame_opencv.h" +#include "mediapipe/framework/formats/matrix.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/parse_text_proto.h" +#include "mediapipe/framework/port/status_matchers.h" // NOLINT +#include "mediapipe/framework/tool/validate_type.h" + +namespace mediapipe { +namespace { + +constexpr char kTransposeOptionsString[] = + "[mediapipe.TensorConverterCalculatorOptions.ext]: {" + "row_major_matrix: True}"; + +} // namespace + +using RandomEngine = std::mt19937_64; +using testing::Eq; +const uint32 kSeed = 1234; +const int kNumSizes = 8; +const int sizes[kNumSizes][2] = {{1, 1}, {12, 1}, {1, 9}, {2, 2}, + {5, 3}, {7, 13}, {16, 32}, {101, 2}}; + +class TensorConverterCalculatorTest : public ::testing::Test { + protected: + // Adds a packet with a matrix filled with random values in [0,1]. + void AddRandomMatrix(int num_rows, int num_columns, uint32 seed, + bool row_major_matrix = false) { + RandomEngine random(kSeed); + std::uniform_real_distribution<> uniform_dist(0, 1.0); + auto matrix = ::absl::make_unique(); + matrix->resize(num_rows, num_columns); + if (row_major_matrix) { + for (int y = 0; y < num_rows; ++y) { + for (int x = 0; x < num_columns; ++x) { + float value = uniform_dist(random); + (*matrix)(y, x) = value; + } + } + } else { + for (int x = 0; x < num_columns; ++x) { + for (int y = 0; y < num_rows; ++y) { + float value = uniform_dist(random); + (*matrix)(y, x) = value; + } + } + } + MP_ASSERT_OK(graph_->AddPacketToInputStream( + "matrix", Adopt(matrix.release()).At(Timestamp(0)))); + } + + std::unique_ptr graph_; +}; + +TEST_F(TensorConverterCalculatorTest, RandomMatrixColMajor) { + for (int size_index = 0; size_index < kNumSizes; ++size_index) { + const int num_rows = sizes[size_index][0]; + const int num_columns = sizes[size_index][1]; + + // Run the calculator and verify that one output is generated. + CalculatorGraphConfig graph_config = + ::mediapipe::ParseTextProtoOrDie(R"( + input_stream: "matrix" + node { + calculator: "TensorConverterCalculator" + input_stream: "MATRIX:matrix" + output_stream: "TENSORS:tensor" + options { + [mediapipe.TensorConverterCalculatorOptions.ext] { + row_major_matrix: false + } + } + } + )"); + std::vector output_packets; + tool::AddVectorSink("tensor", &graph_config, &output_packets); + + // Run the graph. + graph_ = absl::make_unique(); + MP_ASSERT_OK(graph_->Initialize(graph_config)); + MP_ASSERT_OK(graph_->StartRun({})); + + // Push the tensor into the graph. + AddRandomMatrix(num_rows, num_columns, kSeed, /*row_major_matrix=*/false); + + // Wait until the calculator done processing. + MP_ASSERT_OK(graph_->WaitUntilIdle()); + EXPECT_EQ(1, output_packets.size()); + + // Get and process results. + const std::vector& tensor_vec = + output_packets[0].Get>(); + EXPECT_EQ(1, tensor_vec.size()); + + const Tensor* tensor = &tensor_vec[0]; + EXPECT_EQ(Tensor::ElementType::kFloat32, tensor->element_type()); + + // Verify that the data is correct. 
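    // Re-seeding a fresh RandomEngine with kSeed below regenerates exactly the
    // sequence that AddRandomMatrix wrote. Because both the Eigen Matrix and
    // the converted tensor are column-major here, the flat tensor buffer can
    // be compared element by element in generation order.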
+ RandomEngine random(kSeed); + std::uniform_real_distribution<> uniform_dist(0, 1.0); + auto view = tensor->GetCpuReadView(); + auto tensor_buffer = view.buffer(); + for (int i = 0; i < num_rows * num_columns; ++i) { + const float expected = uniform_dist(random); + EXPECT_EQ(expected, tensor_buffer[i]) << "at i = " << i; + } + + // Fully close graph at end, otherwise calculator+tensors are destroyed + // after calling WaitUntilDone(). + MP_ASSERT_OK(graph_->CloseInputStream("matrix")); + MP_ASSERT_OK(graph_->WaitUntilDone()); + + graph_.reset(); + } +} + +TEST_F(TensorConverterCalculatorTest, RandomMatrixRowMajor) { + for (int size_index = 0; size_index < kNumSizes; ++size_index) { + const int num_rows = sizes[size_index][0]; + const int num_columns = sizes[size_index][1]; + + // Run the calculator and verify that one output is generated. + CalculatorGraphConfig graph_config = + ::mediapipe::ParseTextProtoOrDie(R"( + input_stream: "matrix" + node { + calculator: "TensorConverterCalculator" + input_stream: "MATRIX:matrix" + output_stream: "TENSORS:tensor" + options { + [mediapipe.TensorConverterCalculatorOptions.ext] { + row_major_matrix: true + } + } + } + )"); + std::vector output_packets; + tool::AddVectorSink("tensor", &graph_config, &output_packets); + + // Run the graph. + graph_ = absl::make_unique(); + MP_ASSERT_OK(graph_->Initialize(graph_config)); + MP_ASSERT_OK(graph_->StartRun({})); + + // Push the tensor into the graph. + AddRandomMatrix(num_rows, num_columns, kSeed, /*row_major_matrix=*/true); + + // Wait until the calculator done processing. + MP_ASSERT_OK(graph_->WaitUntilIdle()); + EXPECT_EQ(1, output_packets.size()); + + // Get and process results. + const std::vector& tensor_vec = + output_packets[0].Get>(); + EXPECT_EQ(1, tensor_vec.size()); + + const Tensor* tensor = &tensor_vec[0]; + EXPECT_EQ(Tensor::ElementType::kFloat32, tensor->element_type()); + + // Verify that the data is correct. + RandomEngine random(kSeed); + std::uniform_real_distribution<> uniform_dist(0, 1.0); + auto view = tensor->GetCpuReadView(); + auto tensor_buffer = view.buffer(); + for (int i = 0; i < num_rows * num_columns; ++i) { + const float expected = uniform_dist(random); + EXPECT_EQ(expected, tensor_buffer[i]) << "at i = " << i; + } + + // Fully close graph at end, otherwise calculator+tensors are destroyed + // after calling WaitUntilDone(). + MP_ASSERT_OK(graph_->CloseInputStream("matrix")); + MP_ASSERT_OK(graph_->WaitUntilDone()); + + graph_.reset(); + } +} + +TEST_F(TensorConverterCalculatorTest, CustomDivAndSub) { + CalculatorGraph graph; + // Run the calculator and verify that one output is generated. + CalculatorGraphConfig graph_config = + ::mediapipe::ParseTextProtoOrDie(R"( + input_stream: "input_image" + node { + calculator: "TensorConverterCalculator" + input_stream: "IMAGE:input_image" + output_stream: "TENSORS:tensor" + options { + [mediapipe.TensorConverterCalculatorOptions.ext] { + row_major_matrix: true + use_custom_normalization: true + custom_div: 2.0 + custom_sub: 33.0 + } + } + } + )"); + std::vector output_packets; + tool::AddVectorSink("tensor", &graph_config, &output_packets); + + // Run the graph. 
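  // Given the options above (custom_div: 2.0, custom_sub: 33.0) and the single
  // GRAY8 pixel value 200 fed in below, the custom-normalization path computes
  //   200 / 2.0 - 33.0 = 67.0,
  // which is what the EXPECT_FLOAT_EQ further down asserts.
  static_assert(200.0f / 2.0f - 33.0f == 67.0f,
                "expected value for the custom-normalization check");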
+ MP_ASSERT_OK(graph.Initialize(graph_config)); + MP_ASSERT_OK(graph.StartRun({})); + auto input_image = absl::make_unique(ImageFormat::GRAY8, 1, 1); + cv::Mat mat = ::mediapipe::formats::MatView(input_image.get()); + mat.at(0, 0) = 200; + MP_ASSERT_OK(graph.AddPacketToInputStream( + "input_image", Adopt(input_image.release()).At(Timestamp(0)))); + + // Wait until the calculator done processing. + MP_ASSERT_OK(graph.WaitUntilIdle()); + + // Get and process results. + const std::vector& tensor_vec = + output_packets[0].Get>(); + EXPECT_EQ(1, tensor_vec.size()); + + const Tensor* tensor = &tensor_vec[0]; + EXPECT_EQ(Tensor::ElementType::kFloat32, tensor->element_type()); + auto view = tensor->GetCpuReadView(); + EXPECT_FLOAT_EQ(67.0f, *view.buffer()); + + // Fully close graph at end, otherwise calculator+tensors are destroyed + // after calling WaitUntilDone(). + MP_ASSERT_OK(graph.CloseInputStream("input_image")); + MP_ASSERT_OK(graph.WaitUntilDone()); +} + +TEST_F(TensorConverterCalculatorTest, SetOutputRange) { + std::vector> range_values = { + std::make_pair(0.0, 1.0), std::make_pair(-1.0, 1.0), + std::make_pair(-0.5, 0.5)}; + for (std::pair range : range_values) { + CalculatorGraph graph; + CalculatorGraphConfig graph_config = + ::mediapipe::ParseTextProtoOrDie( + absl::Substitute(R"( + input_stream: "input_image" + node { + calculator: "TensorConverterCalculator" + input_stream: "IMAGE:input_image" + output_stream: "TENSORS:tensor" + options { + [mediapipe.TensorConverterCalculatorOptions.ext] { + output_tensor_float_range { + min: $0 + max: $1 + } + } + } + } + )", + /*$0=*/range.first, + /*$1=*/range.second)); + std::vector output_packets; + tool::AddVectorSink("tensor", &graph_config, &output_packets); + + // Run the graph. + MP_ASSERT_OK(graph.Initialize(graph_config)); + MP_ASSERT_OK(graph.StartRun({})); + auto input_image = absl::make_unique(ImageFormat::GRAY8, 1, 1); + cv::Mat mat = ::mediapipe::formats::MatView(input_image.get()); + mat.at(0, 0) = 200; + MP_ASSERT_OK(graph.AddPacketToInputStream( + "input_image", Adopt(input_image.release()).At(Timestamp(0)))); + + // Wait until the calculator finishes processing. + MP_ASSERT_OK(graph.WaitUntilIdle()); + EXPECT_THAT(output_packets.size(), Eq(1)); + + // Get and process results. + const std::vector& tensor_vec = + output_packets[0].Get>(); + EXPECT_THAT(tensor_vec.size(), Eq(1)); + + const Tensor* tensor = &tensor_vec[0]; + + // Calculate the expected normalized value: + float normalized_value = + range.first + (200 * (range.second - range.first)) / 255.0; + + EXPECT_THAT(tensor->element_type(), Eq(Tensor::ElementType::kFloat32)); + auto view = tensor->GetCpuReadView(); + float dataf = *view.buffer(); + EXPECT_THAT( + normalized_value, + testing::FloatNear(dataf, 2.0f * std::abs(dataf) * + std::numeric_limits::epsilon())); + + // Fully close graph at end, otherwise calculator+tensors are destroyed + // after calling WaitUntilDone(). + MP_ASSERT_OK(graph.CloseInputStream("input_image")); + MP_ASSERT_OK(graph.WaitUntilDone()); + } +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/tensors_to_classification_calculator.cc b/mediapipe/calculators/tensor/tensors_to_classification_calculator.cc new file mode 100644 index 0000000000..6dca95d8e2 --- /dev/null +++ b/mediapipe/calculators/tensor/tensors_to_classification_calculator.cc @@ -0,0 +1,197 @@ +// Copyright 2019 The MediaPipe Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "absl/types/span.h" +#include "mediapipe/calculators/tensor/tensors_to_classification_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/classification.pb.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/util/resource_util.h" +#if defined(MEDIAPIPE_MOBILE) +#include "mediapipe/util/android/file/base/file.h" +#include "mediapipe/util/android/file/base/helpers.h" +#else +#include "mediapipe/framework/port/file_helpers.h" +#endif + +namespace mediapipe { + +// Convert result tensors from classification models into MediaPipe +// classifications. +// +// Input: +// TENSORS - Vector of Tensors of type kFloat32 containing one +// tensor, the size of which must be (1, * num_classes). +// Output: +// CLASSIFICATIONS - Result MediaPipe ClassificationList. The score and index +// fields of each classification are set, while the label +// field is only set if label_map_path is provided. +// +// Usage example: +// node { +// calculator: "TensorsToClassificationCalculator" +// input_stream: "TENSORS:tensors" +// output_stream: "CLASSIFICATIONS:classifications" +// options: { +// [mediapipe.TensorsToClassificationCalculatorOptions.ext] { +// num_classes: 1024 +// min_score_threshold: 0.1 +// label_map_path: "labelmap.txt" +// } +// } +// } +class TensorsToClassificationCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + private: + ::mediapipe::TensorsToClassificationCalculatorOptions options_; + int top_k_ = 0; + std::unordered_map label_map_; + bool label_map_loaded_ = false; +}; +REGISTER_CALCULATOR(TensorsToClassificationCalculator); + +::mediapipe::Status TensorsToClassificationCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag("TENSORS")) { + cc->Inputs().Tag("TENSORS").Set>(); + } + + if (cc->Outputs().HasTag("CLASSIFICATIONS")) { + cc->Outputs().Tag("CLASSIFICATIONS").Set(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToClassificationCalculator::Open( + CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + + options_ = + cc->Options<::mediapipe::TensorsToClassificationCalculatorOptions>(); + + top_k_ = options_.top_k(); + if (options_.has_label_map_path()) { + std::string string_path; + ASSIGN_OR_RETURN(string_path, + PathToResourceAsFile(options_.label_map_path())); + std::string label_map_string; + MP_RETURN_IF_ERROR(file::GetContents(string_path, &label_map_string)); + + std::istringstream stream(label_map_string); + 
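    // The label map file is plain text with one label per line; line k becomes
    // the label for class id k, e.g.:
    //   background
    //   person
    //   car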
std::string line; + int i = 0; + while (std::getline(stream, line)) { + label_map_[i++] = line; + } + label_map_loaded_ = true; + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToClassificationCalculator::Process( + CalculatorContext* cc) { + const auto& input_tensors = + cc->Inputs().Tag("TENSORS").Get>(); + + RET_CHECK_EQ(input_tensors.size(), 1); + + int num_classes = input_tensors[0].shape().num_elements(); + + if (options_.binary_classification()) { + RET_CHECK_EQ(num_classes, 1); + // Number of classes for binary classification. + num_classes = 2; + } + if (label_map_loaded_) { + RET_CHECK_EQ(num_classes, label_map_.size()); + } + auto view = input_tensors[0].GetCpuReadView(); + auto raw_scores = view.buffer(); + + auto classification_list = absl::make_unique(); + if (options_.binary_classification()) { + Classification* class_first = classification_list->add_classification(); + Classification* class_second = classification_list->add_classification(); + class_first->set_index(0); + class_second->set_index(1); + class_first->set_score(raw_scores[0]); + class_second->set_score(1. - raw_scores[0]); + + if (label_map_loaded_) { + class_first->set_label(label_map_[0]); + class_second->set_label(label_map_[1]); + } + } else { + for (int i = 0; i < num_classes; ++i) { + if (options_.has_min_score_threshold() && + raw_scores[i] < options_.min_score_threshold()) { + continue; + } + Classification* classification = + classification_list->add_classification(); + classification->set_index(i); + classification->set_score(raw_scores[i]); + + if (label_map_loaded_) { + classification->set_label(label_map_[i]); + } + } + } + + // Note that partial_sort will raise error when top_k_ > + // classification_list->classification_size(). + CHECK_GE(classification_list->classification_size(), top_k_); + auto raw_classification_list = classification_list->mutable_classification(); + if (top_k_ > 0 && classification_list->classification_size() >= top_k_) { + std::partial_sort(raw_classification_list->begin(), + raw_classification_list->begin() + top_k_, + raw_classification_list->end(), + [](const Classification a, const Classification b) { + return a.score() > b.score(); + }); + + // Resizes the underlying list to have only top_k_ classifications. + raw_classification_list->DeleteSubrange( + top_k_, raw_classification_list->size() - top_k_); + } + cc->Outputs() + .Tag("CLASSIFICATIONS") + .Add(classification_list.release(), cc->InputTimestamp()); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToClassificationCalculator::Close( + CalculatorContext* cc) { + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/tensors_to_classification_calculator.proto b/mediapipe/calculators/tensor/tensors_to_classification_calculator.proto new file mode 100644 index 0000000000..51f7f3f902 --- /dev/null +++ b/mediapipe/calculators/tensor/tensors_to_classification_calculator.proto @@ -0,0 +1,41 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The option proto for the TensorsToClassificationCalculator.
+
+syntax = "proto2";
+
+package mediapipe;
+
+import "mediapipe/framework/calculator.proto";
+
+message TensorsToClassificationCalculatorOptions {
+  extend .mediapipe.CalculatorOptions {
+    optional TensorsToClassificationCalculatorOptions ext = 335742638;
+  }
+
+  // Score threshold for preserving the class.
+  optional float min_score_threshold = 1;
+  // Number of highest scoring labels to output. If top_k is not positive then
+  // all labels are used.
+  optional int32 top_k = 2;
+  // Path to a label map file for getting the actual name of class ids.
+  optional string label_map_path = 3;
+  // Whether the input is a single float for binary classification.
+  // When true, only a single float is expected in the input tensor and the
+  // label map, if provided, is expected to have exactly two labels.
+  // The single score (float) represents the probability of the first label,
+  // and 1 - score is the probability of the second label.
+  optional bool binary_classification = 4;
+}
diff --git a/mediapipe/calculators/tensor/tensors_to_classification_calculator_test.cc b/mediapipe/calculators/tensor/tensors_to_classification_calculator_test.cc
new file mode 100644
index 0000000000..8e26194299
--- /dev/null
+++ b/mediapipe/calculators/tensor/tensors_to_classification_calculator_test.cc
@@ -0,0 +1,174 @@
+// Copyright 2019 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
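// A minimal standalone sketch of the top_k selection the option above drives in
// TensorsToClassificationCalculator::Process(): the classifications are partially
// sorted by score and only the top_k highest-scoring entries are kept (mirroring
// the std::partial_sort + DeleteSubrange in the calculator). TopKScores is a
// hypothetical helper for illustration only.
#include <algorithm>
#include <utility>
#include <vector>

std::vector<std::pair<int, float>> TopKScores(
    std::vector<std::pair<int, float>> entries, int top_k) {
  if (top_k > 0 && static_cast<int>(entries.size()) >= top_k) {
    std::partial_sort(
        entries.begin(), entries.begin() + top_k, entries.end(),
        [](const auto& a, const auto& b) { return a.second > b.second; });
    entries.resize(top_k);
  }
  return entries;
}

// E.g. scores {0, 0.5, 1} with top_k: 2 keep class ids {2, 1} with scores
// {1.0, 0.5}, which is what the CorrectOutputWithTopK test below verifies.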
+ +#include + +#include "absl/memory/memory.h" +#include "mediapipe/calculators/tensor/tensors_to_classification_calculator.pb.h" +#include "mediapipe/framework/calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/calculator_runner.h" +#include "mediapipe/framework/formats/classification.pb.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/parse_text_proto.h" +#include "mediapipe/framework/port/status_matchers.h" + +namespace mediapipe { + +using ::mediapipe::ParseTextProtoOrDie; +using Node = ::mediapipe::CalculatorGraphConfig::Node; + +class TensorsToClassificationCalculatorTest : public ::testing::Test { + protected: + void BuildGraph(mediapipe::CalculatorRunner* runner, + const std::vector& scores) { + auto tensors = absl::make_unique>(); + tensors->emplace_back( + Tensor::ElementType::kFloat32, + Tensor::Shape{1, 1, static_cast(scores.size()), 1}); + auto view = tensors->back().GetCpuWriteView(); + float* tensor_buffer = view.buffer(); + ASSERT_NE(tensor_buffer, nullptr); + for (int i = 0; i < scores.size(); ++i) { + tensor_buffer[i] = scores[i]; + } + + int64 stream_timestamp = 0; + auto& input_stream_packets = + runner->MutableInputs()->Tag("TENSORS").packets; + + input_stream_packets.push_back( + mediapipe::Adopt(tensors.release()) + .At(mediapipe::Timestamp(stream_timestamp++))); + } +}; + +TEST_F(TensorsToClassificationCalculatorTest, CorrectOutput) { + mediapipe::CalculatorRunner runner(ParseTextProtoOrDie(R"( + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:tensors" + output_stream: "CLASSIFICATIONS:classifications" + options { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] {} + } + )")); + + BuildGraph(&runner, {0, 0.5, 1}); + MP_ASSERT_OK(runner.Run()); + + const auto& output_packets_ = runner.Outputs().Tag("CLASSIFICATIONS").packets; + + EXPECT_EQ(1, output_packets_.size()); + + const auto& classification_list = + output_packets_[0].Get(); + EXPECT_EQ(3, classification_list.classification_size()); + + // Verify that the label_id and score fields are set correctly. + for (int i = 0; i < classification_list.classification_size(); ++i) { + EXPECT_EQ(i, classification_list.classification(i).index()); + EXPECT_EQ(i * 0.5, classification_list.classification(i).score()); + ASSERT_FALSE(classification_list.classification(i).has_label()); + } +} + +TEST_F(TensorsToClassificationCalculatorTest, CorrectOutputWithLabelMapPath) { + mediapipe::CalculatorRunner runner(ParseTextProtoOrDie(R"( + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:tensors" + output_stream: "CLASSIFICATIONS:classifications" + options { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] { + label_map_path: "mediapipe/calculators/tensor/testdata/labelmap.txt" + } + } + )")); + + BuildGraph(&runner, {0, 0.5, 1}); + MP_ASSERT_OK(runner.Run()); + + const auto& output_packets_ = runner.Outputs().Tag("CLASSIFICATIONS").packets; + + EXPECT_EQ(1, output_packets_.size()); + + const auto& classification_list = + output_packets_[0].Get(); + EXPECT_EQ(3, classification_list.classification_size()); + + // Verify that the label field is set. 
+ for (int i = 0; i < classification_list.classification_size(); ++i) { + EXPECT_EQ(i, classification_list.classification(i).index()); + EXPECT_EQ(i * 0.5, classification_list.classification(i).score()); + ASSERT_TRUE(classification_list.classification(i).has_label()); + } +} + +TEST_F(TensorsToClassificationCalculatorTest, + CorrectOutputWithLabelMinScoreThreshold) { + mediapipe::CalculatorRunner runner(ParseTextProtoOrDie(R"( + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:tensors" + output_stream: "CLASSIFICATIONS:classifications" + options { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] { + min_score_threshold: 0.6 + } + } + )")); + + BuildGraph(&runner, {0, 0.5, 1}); + MP_ASSERT_OK(runner.Run()); + + const auto& output_packets_ = runner.Outputs().Tag("CLASSIFICATIONS").packets; + + EXPECT_EQ(1, output_packets_.size()); + + const auto& classification_list = + output_packets_[0].Get(); + + // Verify that the low score labels are filtered out. + EXPECT_EQ(1, classification_list.classification_size()); + EXPECT_EQ(1, classification_list.classification(0).score()); +} + +TEST_F(TensorsToClassificationCalculatorTest, CorrectOutputWithTopK) { + mediapipe::CalculatorRunner runner(ParseTextProtoOrDie(R"( + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:tensors" + output_stream: "CLASSIFICATIONS:classifications" + options { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] { top_k: 2 } + } + )")); + + BuildGraph(&runner, {0, 0.5, 1}); + MP_ASSERT_OK(runner.Run()); + + const auto& output_packets_ = runner.Outputs().Tag("CLASSIFICATIONS").packets; + + EXPECT_EQ(1, output_packets_.size()); + + const auto& classification_list = + output_packets_[0].Get(); + + // Verify that the only top2 labels are left. + EXPECT_EQ(2, classification_list.classification_size()); + for (int i = 0; i < classification_list.classification_size(); ++i) { + EXPECT_EQ((classification_list.classification_size() - i) * 0.5, + classification_list.classification(i).score()); + } +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc b/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc new file mode 100644 index 0000000000..39add5062e --- /dev/null +++ b/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc @@ -0,0 +1,1161 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "absl/strings/str_format.h" +#include "absl/types/span.h" +#include "mediapipe/calculators/tensor/tensors_to_detections_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/deps/file_path.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/formats/location.h" +#include "mediapipe/framework/formats/object_detection/anchor.pb.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port.h" +#include "mediapipe/framework/port/ret_check.h" + +// Note: On Apple platforms MEDIAPIPE_DISABLE_GL_COMPUTE is automatically +// defined in mediapipe/framework/port.h. Therefore, +// "#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE" and "#if MEDIAPIPE_METAL_ENABLED" +// below are mutually exclusive. +#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE +#include "mediapipe/gpu/gl_calculator_helper.h" +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) + +#if MEDIAPIPE_METAL_ENABLED +#import +#import +#import + +#import "mediapipe/gpu/MPPMetalHelper.h" +#include "mediapipe/gpu/MPPMetalUtil.h" +#endif // MEDIAPIPE_METAL_ENABLED + +namespace { +constexpr int kNumInputTensorsWithAnchors = 3; +constexpr int kNumCoordsPerBox = 4; +constexpr char kDetectionsTag[] = "DETECTIONS"; +constexpr char kTensorsTag[] = "TENSORS"; +constexpr char kAnchorsTag[] = "ANCHORS"; + +bool CanUseGpu() { +#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) || MEDIAPIPE_METAL_ENABLED + // TODO: Configure GPU usage policy in individual calculators. + constexpr bool kAllowGpuProcessing = true; + return kAllowGpuProcessing; +#else + return false; +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) || MEDIAPIPE_METAL_ENABLED +} +} // namespace + +namespace mediapipe { + +namespace { + +void ConvertRawValuesToAnchors(const float* raw_anchors, int num_boxes, + std::vector* anchors) { + anchors->clear(); + for (int i = 0; i < num_boxes; ++i) { + Anchor new_anchor; + new_anchor.set_y_center(raw_anchors[i * kNumCoordsPerBox + 0]); + new_anchor.set_x_center(raw_anchors[i * kNumCoordsPerBox + 1]); + new_anchor.set_h(raw_anchors[i * kNumCoordsPerBox + 2]); + new_anchor.set_w(raw_anchors[i * kNumCoordsPerBox + 3]); + anchors->push_back(new_anchor); + } +} + +void ConvertAnchorsToRawValues(const std::vector& anchors, + int num_boxes, float* raw_anchors) { + CHECK_EQ(anchors.size(), num_boxes); + int box = 0; + for (const auto& anchor : anchors) { + raw_anchors[box * kNumCoordsPerBox + 0] = anchor.y_center(); + raw_anchors[box * kNumCoordsPerBox + 1] = anchor.x_center(); + raw_anchors[box * kNumCoordsPerBox + 2] = anchor.h(); + raw_anchors[box * kNumCoordsPerBox + 3] = anchor.w(); + ++box; + } +} + +} // namespace + +// Convert result Tensors from object detection models into MediaPipe +// Detections. +// +// Input: +// TENSORS - Vector of Tensors of type kFloat32. The vector of tensors can have +// 2 or 3 tensors. First tensor is the predicted raw boxes/keypoints. +// The size of the values must be (num_boxes * num_predicted_values). +// Second tensor is the score tensor. The size of the valuse must be +// (num_boxes * num_classes). It's optional to pass in a third tensor +// for anchors (e.g. for SSD models) depend on the outputs of the +// detection model. The size of anchor tensor must be (num_boxes * +// 4). +// Output: +// DETECTIONS - Result MediaPipe detections. 
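// Input side packet (optional):
//  ANCHORS - std::vector<Anchor> of SSD anchors, used when the model does not
//            output a third anchor tensor (see GetContract and ProcessCPU
//            below).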
+// +// Usage example: +// node { +// calculator: "TensorsToDetectionsCalculator" +// input_stream: "TENSORS:tensors" +// input_side_packet: "ANCHORS:anchors" +// output_stream: "DETECTIONS:detections" +// options: { +// [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { +// num_classes: 91 +// num_boxes: 1917 +// num_coords: 4 +// ignore_classes: [0, 1, 2] +// x_scale: 10.0 +// y_scale: 10.0 +// h_scale: 5.0 +// w_scale: 5.0 +// } +// } +// } +class TensorsToDetectionsCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + private: + ::mediapipe::Status ProcessCPU(CalculatorContext* cc, + std::vector* output_detections); + ::mediapipe::Status ProcessGPU(CalculatorContext* cc, + std::vector* output_detections); + + ::mediapipe::Status LoadOptions(CalculatorContext* cc); + ::mediapipe::Status GpuInit(CalculatorContext* cc); + ::mediapipe::Status DecodeBoxes(const float* raw_boxes, + const std::vector& anchors, + std::vector* boxes); + ::mediapipe::Status ConvertToDetections( + const float* detection_boxes, const float* detection_scores, + const int* detection_classes, std::vector* output_detections); + Detection ConvertToDetection(float box_ymin, float box_xmin, float box_ymax, + float box_xmax, float score, int class_id, + bool flip_vertically); + + int num_classes_ = 0; + int num_boxes_ = 0; + int num_coords_ = 0; + std::set ignore_classes_; + + ::mediapipe::TensorsToDetectionsCalculatorOptions options_; + std::vector anchors_; + bool side_packet_anchors_{}; + +#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE + mediapipe::GlCalculatorHelper gpu_helper_; + GLuint decode_program_; + GLuint score_program_; +#elif MEDIAPIPE_METAL_ENABLED + MPPMetalHelper* gpu_helper_ = nullptr; + id decode_program_; + id score_program_; +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) + std::unique_ptr raw_anchors_buffer_; + std::unique_ptr decoded_boxes_buffer_; + std::unique_ptr scored_boxes_buffer_; + + bool gpu_input_ = false; + bool anchors_init_ = false; +}; +REGISTER_CALCULATOR(TensorsToDetectionsCalculator); + +::mediapipe::Status TensorsToDetectionsCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(cc->Inputs().HasTag(kTensorsTag)); + cc->Inputs().Tag(kTensorsTag).Set>(); + + RET_CHECK(cc->Outputs().HasTag(kDetectionsTag)); + cc->Outputs().Tag(kDetectionsTag).Set>(); + + if (cc->InputSidePackets().UsesTags()) { + if (cc->InputSidePackets().HasTag(kAnchorsTag)) { + cc->InputSidePackets().Tag(kAnchorsTag).Set>(); + } + } + + if (CanUseGpu()) { +#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE + MP_RETURN_IF_ERROR(mediapipe::GlCalculatorHelper::UpdateContract(cc)); +#elif MEDIAPIPE_METAL_ENABLED + MP_RETURN_IF_ERROR([MPPMetalHelper updateContract:cc]); +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::Open(CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + side_packet_anchors_ = cc->InputSidePackets().HasTag(kAnchorsTag); + MP_RETURN_IF_ERROR(LoadOptions(cc)); + + if (CanUseGpu()) { +#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE + MP_RETURN_IF_ERROR(gpu_helper_.Open(cc)); +#elif MEDIAPIPE_METAL_ENABLED + gpu_helper_ = [[MPPMetalHelper alloc] initWithCalculatorContext:cc]; + RET_CHECK(gpu_helper_); +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) + 
MP_RETURN_IF_ERROR(GpuInit(cc)); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().Tag(kTensorsTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + + auto output_detections = absl::make_unique>(); + + bool gpu_processing = false; + if (CanUseGpu()) { + // Use GPU processing only if at least one input tensor is already on GPU + // (to avoid CPU->GPU overhead). + for (const auto& tensor : + cc->Inputs().Tag(kTensorsTag).Get>()) { + if (tensor.ready_on_gpu()) { + gpu_processing = true; + break; + } + } + } + + if (gpu_processing) { + MP_RETURN_IF_ERROR(ProcessGPU(cc, output_detections.get())); + } else { + MP_RETURN_IF_ERROR(ProcessCPU(cc, output_detections.get())); + } + + // Output + cc->Outputs() + .Tag(kDetectionsTag) + .Add(output_detections.release(), cc->InputTimestamp()); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::ProcessCPU( + CalculatorContext* cc, std::vector* output_detections) { + const auto& input_tensors = + cc->Inputs().Tag(kTensorsTag).Get>(); + + if (input_tensors.size() == 2 || + input_tensors.size() == kNumInputTensorsWithAnchors) { + // Postprocessing on CPU for model without postprocessing op. E.g. output + // raw score tensor and box tensor. Anchor decoding will be handled below. + // TODO: Add flexible input tensor size handling. + auto raw_box_tensor = &input_tensors[0]; + RET_CHECK_EQ(raw_box_tensor->shape().dims.size(), 3); + RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1); + RET_CHECK_EQ(raw_box_tensor->shape().dims[1], num_boxes_); + RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_coords_); + auto raw_score_tensor = &input_tensors[1]; + RET_CHECK_EQ(raw_score_tensor->shape().dims.size(), 3); + RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1); + RET_CHECK_EQ(raw_score_tensor->shape().dims[1], num_boxes_); + RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_classes_); + auto raw_box_view = raw_box_tensor->GetCpuReadView(); + auto raw_boxes = raw_box_view.buffer(); + auto raw_scores_view = raw_score_tensor->GetCpuReadView(); + auto raw_scores = raw_scores_view.buffer(); + + // TODO: Support other options to load anchors. + if (!anchors_init_) { + if (input_tensors.size() == kNumInputTensorsWithAnchors) { + auto anchor_tensor = &input_tensors[2]; + RET_CHECK_EQ(anchor_tensor->shape().dims.size(), 2); + RET_CHECK_EQ(anchor_tensor->shape().dims[0], num_boxes_); + RET_CHECK_EQ(anchor_tensor->shape().dims[1], kNumCoordsPerBox); + auto anchor_view = anchor_tensor->GetCpuReadView(); + auto raw_anchors = anchor_view.buffer(); + ConvertRawValuesToAnchors(raw_anchors, num_boxes_, &anchors_); + } else if (side_packet_anchors_) { + CHECK(!cc->InputSidePackets().Tag("ANCHORS").IsEmpty()); + anchors_ = + cc->InputSidePackets().Tag("ANCHORS").Get>(); + } else { + return ::mediapipe::UnavailableError("No anchor data available."); + } + anchors_init_ = true; + } + std::vector boxes(num_boxes_ * num_coords_); + MP_RETURN_IF_ERROR(DecodeBoxes(raw_boxes, anchors_, &boxes)); + + std::vector detection_scores(num_boxes_); + std::vector detection_classes(num_boxes_); + + // Filter classes by scores. + for (int i = 0; i < num_boxes_; ++i) { + int class_id = -1; + float max_score = -std::numeric_limits::max(); + // Find the top score for box i. 
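      // When sigmoid_score is set, each raw score is optionally clamped to
      // [-score_clipping_thresh, +score_clipping_thresh] and then squashed with
      //   score = 1 / (1 + exp(-score))
      // before the per-box maximum over the non-ignored classes is taken.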
+ for (int score_idx = 0; score_idx < num_classes_; ++score_idx) { + if (ignore_classes_.find(score_idx) == ignore_classes_.end()) { + auto score = raw_scores[i * num_classes_ + score_idx]; + if (options_.sigmoid_score()) { + if (options_.has_score_clipping_thresh()) { + score = score < -options_.score_clipping_thresh() + ? -options_.score_clipping_thresh() + : score; + score = score > options_.score_clipping_thresh() + ? options_.score_clipping_thresh() + : score; + } + score = 1.0f / (1.0f + std::exp(-score)); + } + if (max_score < score) { + max_score = score; + class_id = score_idx; + } + } + } + detection_scores[i] = max_score; + detection_classes[i] = class_id; + } + + MP_RETURN_IF_ERROR( + ConvertToDetections(boxes.data(), detection_scores.data(), + detection_classes.data(), output_detections)); + } else { + // Postprocessing on CPU with postprocessing op (e.g. anchor decoding and + // non-maximum suppression) within the model. + RET_CHECK_EQ(input_tensors.size(), 4); + + auto num_boxes_tensor = &input_tensors[3]; + RET_CHECK_EQ(num_boxes_tensor->shape().dims.size(), 1); + RET_CHECK_EQ(num_boxes_tensor->shape().dims[0], 1); + + auto detection_boxes_tensor = &input_tensors[0]; + RET_CHECK_EQ(detection_boxes_tensor->shape().dims.size(), 3); + RET_CHECK_EQ(detection_boxes_tensor->shape().dims[0], 1); + const int max_detections = detection_boxes_tensor->shape().dims[1]; + RET_CHECK_EQ(detection_boxes_tensor->shape().dims[2], num_coords_); + + auto detection_classes_tensor = &input_tensors[1]; + RET_CHECK_EQ(detection_classes_tensor->shape().dims.size(), 2); + RET_CHECK_EQ(detection_classes_tensor->shape().dims[0], 1); + RET_CHECK_EQ(detection_classes_tensor->shape().dims[1], max_detections); + + auto detection_scores_tensor = &input_tensors[2]; + RET_CHECK_EQ(detection_scores_tensor->shape().dims.size(), 2); + RET_CHECK_EQ(detection_scores_tensor->shape().dims[0], 1); + RET_CHECK_EQ(detection_scores_tensor->shape().dims[1], max_detections); + + auto num_boxes_view = num_boxes_tensor->GetCpuReadView(); + auto num_boxes = num_boxes_view.buffer(); + num_boxes_ = num_boxes[0]; + + auto detection_boxes_view = detection_boxes_tensor->GetCpuReadView(); + auto detection_boxes = detection_boxes_view.buffer(); + + auto detection_scores_view = detection_scores_tensor->GetCpuReadView(); + auto detection_scores = detection_scores_view.buffer(); + + auto detection_classes_view = detection_classes_tensor->GetCpuReadView(); + auto detection_classes_ptr = detection_classes_view.buffer(); + std::vector detection_classes(num_boxes_); + for (int i = 0; i < num_boxes_; ++i) { + detection_classes[i] = static_cast(detection_classes_ptr[i]); + } + MP_RETURN_IF_ERROR(ConvertToDetections(detection_boxes, detection_scores, + detection_classes.data(), + output_detections)); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::ProcessGPU( + CalculatorContext* cc, std::vector* output_detections) { + const auto& input_tensors = + cc->Inputs().Tag(kTensorsTag).Get>(); + RET_CHECK_GE(input_tensors.size(), 2); +#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE + + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext([this, &input_tensors, &cc, + &output_detections]() + -> ::mediapipe::Status { + if (!anchors_init_) { + if (side_packet_anchors_) { + CHECK(!cc->InputSidePackets().Tag(kAnchorsTag).IsEmpty()); + const auto& anchors = + cc->InputSidePackets().Tag(kAnchorsTag).Get>(); + auto anchors_view = raw_anchors_buffer_->GetCpuWriteView(); + auto raw_anchors = anchors_view.buffer(); + 
ConvertAnchorsToRawValues(anchors, num_boxes_, raw_anchors); + } else { + CHECK_EQ(input_tensors.size(), kNumInputTensorsWithAnchors); + auto read_view = input_tensors[2].GetOpenGlBufferReadView(); + glBindBuffer(GL_COPY_READ_BUFFER, read_view.name()); + auto write_view = raw_anchors_buffer_->GetOpenGlBufferWriteView(); + glBindBuffer(GL_COPY_WRITE_BUFFER, write_view.name()); + glCopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, 0, 0, + input_tensors[2].bytes()); + } + anchors_init_ = true; + } + + // Use the scope to release the writable buffers' views before requesting + // the reading buffers' views. + { + // Decode boxes. + auto decoded_boxes_view = + decoded_boxes_buffer_->GetOpenGlBufferWriteView(); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, decoded_boxes_view.name()); + auto input0_view = input_tensors[0].GetOpenGlBufferReadView(); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, input0_view.name()); + auto raw_anchors_view = raw_anchors_buffer_->GetOpenGlBufferReadView(); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, raw_anchors_view.name()); + glUseProgram(decode_program_); + glDispatchCompute(num_boxes_, 1, 1); + + // Score boxes. + auto scored_boxes_view = scored_boxes_buffer_->GetOpenGlBufferWriteView(); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, scored_boxes_view.name()); + auto input1_view = input_tensors[1].GetOpenGlBufferReadView(); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, input1_view.name()); + glUseProgram(score_program_); + glDispatchCompute(num_boxes_, 1, 1); + } + + // TODO: b/138851969. Is it possible to output a float vector + // for score and an int vector for class so that we can avoid copying twice? + std::vector detection_scores(num_boxes_); + std::vector detection_classes(num_boxes_); + auto score_view = scored_boxes_buffer_->GetCpuReadView(); + auto score_class_id_pairs = score_view.buffer(); + for (int i = 0; i < num_boxes_; ++i) { + detection_scores[i] = score_class_id_pairs[i * 2]; + detection_classes[i] = static_cast(score_class_id_pairs[i * 2 + 1]); + } + auto boxes_view = decoded_boxes_buffer_->GetCpuReadView(); + auto boxes = boxes_view.buffer(); + MP_RETURN_IF_ERROR(ConvertToDetections(boxes, detection_scores.data(), + detection_classes.data(), + output_detections)); + + return ::mediapipe::OkStatus(); + })); +#elif MEDIAPIPE_METAL_ENABLED + id device = gpu_helper_.mtlDevice; + if (!anchors_init_) { + if (side_packet_anchors_) { + CHECK(!cc->InputSidePackets().Tag(kAnchorsTag).IsEmpty()); + const auto& anchors = + cc->InputSidePackets().Tag(kAnchorsTag).Get>(); + auto raw_anchors_view = raw_anchors_buffer_->GetCpuWriteView(); + ConvertAnchorsToRawValues(anchors, num_boxes_, + raw_anchors_view.buffer()); + } else { + RET_CHECK_EQ(input_tensors.size(), kNumInputTensorsWithAnchors); + auto command_buffer = [gpu_helper_ commandBuffer]; + auto src_buffer = input_tensors[2].GetMtlBufferReadView(command_buffer); + auto dest_buffer = + raw_anchors_buffer_->GetMtlBufferWriteView(command_buffer); + id blit_command = + [command_buffer blitCommandEncoder]; + [blit_command copyFromBuffer:src_buffer.buffer() + sourceOffset:0 + toBuffer:dest_buffer.buffer() + destinationOffset:0 + size:input_tensors[2].bytes()]; + [blit_command endEncoding]; + [command_buffer commit]; + } + anchors_init_ = true; + } + + // Use the scope to release the writable buffers' views before requesting the + // reading buffers' views. 
+ id command_buffer = [gpu_helper_ commandBuffer]; + command_buffer.label = @"DecodeAndScoreBoxes"; + id command_encoder = + [command_buffer computeCommandEncoder]; + [command_encoder setComputePipelineState:decode_program_]; + { + auto decoded_boxes_view = + decoded_boxes_buffer_->GetMtlBufferWriteView(command_buffer); + [command_encoder setBuffer:decoded_boxes_view.buffer() offset:0 atIndex:0]; + auto input0_view = input_tensors[0].GetMtlBufferReadView(command_buffer); + [command_encoder setBuffer:input0_view.buffer() offset:0 atIndex:1]; + auto raw_anchors_view = + raw_anchors_buffer_->GetMtlBufferReadView(command_buffer); + [command_encoder setBuffer:raw_anchors_view.buffer() offset:0 atIndex:2]; + MTLSize decode_threads_per_group = MTLSizeMake(1, 1, 1); + MTLSize decode_threadgroups = MTLSizeMake(num_boxes_, 1, 1); + [command_encoder dispatchThreadgroups:decode_threadgroups + threadsPerThreadgroup:decode_threads_per_group]; + + [command_encoder setComputePipelineState:score_program_]; + auto scored_boxes_view = + scored_boxes_buffer_->GetMtlBufferWriteView(command_buffer); + [command_encoder setBuffer:scored_boxes_view.buffer() offset:0 atIndex:0]; + auto input1_view = input_tensors[1].GetMtlBufferReadView(command_buffer); + [command_encoder setBuffer:input1_view.buffer() offset:0 atIndex:1]; + MTLSize score_threads_per_group = MTLSizeMake(1, num_classes_, 1); + MTLSize score_threadgroups = MTLSizeMake(num_boxes_, 1, 1); + [command_encoder dispatchThreadgroups:score_threadgroups + threadsPerThreadgroup:score_threads_per_group]; + [command_encoder endEncoding]; + [command_buffer commit]; + } + + // Output detections. + // TODO Adjust shader to avoid copying shader output twice. + std::vector detection_scores(num_boxes_); + std::vector detection_classes(num_boxes_); + { + auto scored_boxes_view = scored_boxes_buffer_->GetCpuReadView(); + auto score_class_id_pairs = scored_boxes_view.buffer(); + for (int i = 0; i < num_boxes_; ++i) { + detection_scores[i] = score_class_id_pairs[i * 2]; + detection_classes[i] = static_cast(score_class_id_pairs[i * 2 + 1]); + } + } + auto decoded_boxes_view = decoded_boxes_buffer_->GetCpuReadView(); + auto boxes = decoded_boxes_view.buffer(); + MP_RETURN_IF_ERROR(ConvertToDetections(boxes, detection_scores.data(), + detection_classes.data(), + output_detections)); + +#else + LOG(ERROR) << "GPU input on non-Android not supported yet."; +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::Close( + CalculatorContext* cc) { +#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE + gpu_helper_.RunInGlContext([this] { + decoded_boxes_buffer_ = nullptr; + scored_boxes_buffer_ = nullptr; + raw_anchors_buffer_ = nullptr; + glDeleteProgram(decode_program_); + glDeleteProgram(score_program_); + }); +#elif MEDIAPIPE_METAL_ENABLED + decoded_boxes_buffer_ = nullptr; + scored_boxes_buffer_ = nullptr; + raw_anchors_buffer_ = nullptr; + decode_program_ = nil; + score_program_ = nil; +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::LoadOptions( + CalculatorContext* cc) { + // Get calculator options specified in the graph. 
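  // Note that num_coords must equal
  //   kNumCoordsPerBox (4) + num_keypoints * num_values_per_keypoint;
  // e.g. a model predicting 6 two-value keypoints per box uses
  //   4 + 6 * 2 = 16 coordinates.
  // This is enforced by the CHECK_EQ below.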
+ options_ = cc->Options<::mediapipe::TensorsToDetectionsCalculatorOptions>(); + RET_CHECK(options_.has_num_classes()); + RET_CHECK(options_.has_num_boxes()); + RET_CHECK(options_.has_num_coords()); + + num_classes_ = options_.num_classes(); + num_boxes_ = options_.num_boxes(); + num_coords_ = options_.num_coords(); + + // Currently only support 2D when num_values_per_keypoint equals to 2. + CHECK_EQ(options_.num_values_per_keypoint(), 2); + + // Check if the output size is equal to the requested boxes and keypoints. + CHECK_EQ(options_.num_keypoints() * options_.num_values_per_keypoint() + + kNumCoordsPerBox, + num_coords_); + + for (int i = 0; i < options_.ignore_classes_size(); ++i) { + ignore_classes_.insert(options_.ignore_classes(i)); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::DecodeBoxes( + const float* raw_boxes, const std::vector& anchors, + std::vector* boxes) { + for (int i = 0; i < num_boxes_; ++i) { + const int box_offset = i * num_coords_ + options_.box_coord_offset(); + + float y_center = raw_boxes[box_offset]; + float x_center = raw_boxes[box_offset + 1]; + float h = raw_boxes[box_offset + 2]; + float w = raw_boxes[box_offset + 3]; + if (options_.reverse_output_order()) { + x_center = raw_boxes[box_offset]; + y_center = raw_boxes[box_offset + 1]; + w = raw_boxes[box_offset + 2]; + h = raw_boxes[box_offset + 3]; + } + + x_center = + x_center / options_.x_scale() * anchors[i].w() + anchors[i].x_center(); + y_center = + y_center / options_.y_scale() * anchors[i].h() + anchors[i].y_center(); + + if (options_.apply_exponential_on_box_size()) { + h = std::exp(h / options_.h_scale()) * anchors[i].h(); + w = std::exp(w / options_.w_scale()) * anchors[i].w(); + } else { + h = h / options_.h_scale() * anchors[i].h(); + w = w / options_.w_scale() * anchors[i].w(); + } + + const float ymin = y_center - h / 2.f; + const float xmin = x_center - w / 2.f; + const float ymax = y_center + h / 2.f; + const float xmax = x_center + w / 2.f; + + (*boxes)[i * num_coords_ + 0] = ymin; + (*boxes)[i * num_coords_ + 1] = xmin; + (*boxes)[i * num_coords_ + 2] = ymax; + (*boxes)[i * num_coords_ + 3] = xmax; + + if (options_.num_keypoints()) { + for (int k = 0; k < options_.num_keypoints(); ++k) { + const int offset = i * num_coords_ + options_.keypoint_coord_offset() + + k * options_.num_values_per_keypoint(); + + float keypoint_y = raw_boxes[offset]; + float keypoint_x = raw_boxes[offset + 1]; + if (options_.reverse_output_order()) { + keypoint_x = raw_boxes[offset]; + keypoint_y = raw_boxes[offset + 1]; + } + + (*boxes)[offset] = keypoint_x / options_.x_scale() * anchors[i].w() + + anchors[i].x_center(); + (*boxes)[offset + 1] = + keypoint_y / options_.y_scale() * anchors[i].h() + + anchors[i].y_center(); + } + } + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToDetectionsCalculator::ConvertToDetections( + const float* detection_boxes, const float* detection_scores, + const int* detection_classes, std::vector* output_detections) { + for (int i = 0; i < num_boxes_; ++i) { + if (options_.has_min_score_thresh() && + detection_scores[i] < options_.min_score_thresh()) { + continue; + } + const int box_offset = i * num_coords_; + Detection detection = ConvertToDetection( + detection_boxes[box_offset + 0], detection_boxes[box_offset + 1], + detection_boxes[box_offset + 2], detection_boxes[box_offset + 3], + detection_scores[i], detection_classes[i], options_.flip_vertically()); + const auto& bbox = 
detection.location_data().relative_bounding_box(); + if (bbox.width() < 0 || bbox.height() < 0) { + // Decoded detection boxes could have negative values for width/height due + // to model prediction. Filter out those boxes since some downstream + // calculators may assume non-negative values. (b/171391719) + continue; + } + // Add keypoints. + if (options_.num_keypoints() > 0) { + auto* location_data = detection.mutable_location_data(); + for (int kp_id = 0; kp_id < options_.num_keypoints() * + options_.num_values_per_keypoint(); + kp_id += options_.num_values_per_keypoint()) { + auto keypoint = location_data->add_relative_keypoints(); + const int keypoint_index = + box_offset + options_.keypoint_coord_offset() + kp_id; + keypoint->set_x(detection_boxes[keypoint_index + 0]); + keypoint->set_y(options_.flip_vertically() + ? 1.f - detection_boxes[keypoint_index + 1] + : detection_boxes[keypoint_index + 1]); + } + } + output_detections->emplace_back(detection); + } + return ::mediapipe::OkStatus(); +} + +Detection TensorsToDetectionsCalculator::ConvertToDetection( + float box_ymin, float box_xmin, float box_ymax, float box_xmax, float score, + int class_id, bool flip_vertically) { + Detection detection; + detection.add_score(score); + detection.add_label_id(class_id); + + LocationData* location_data = detection.mutable_location_data(); + location_data->set_format(LocationData::RELATIVE_BOUNDING_BOX); + + LocationData::RelativeBoundingBox* relative_bbox = + location_data->mutable_relative_bounding_box(); + + relative_bbox->set_xmin(box_xmin); + relative_bbox->set_ymin(flip_vertically ? 1.f - box_ymax : box_ymin); + relative_bbox->set_width(box_xmax - box_xmin); + relative_bbox->set_height(box_ymax - box_ymin); + return detection; +} + +::mediapipe::Status TensorsToDetectionsCalculator::GpuInit( + CalculatorContext* cc) { +#ifndef MEDIAPIPE_DISABLE_GL_COMPUTE + MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext([this]() + -> ::mediapipe::Status { + // A shader to decode detection boxes. 
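  // For reference, the per-anchor transform implemented both by DecodeBoxes()
  // above and by the compute shader below (reverse_output_order and keypoints
  // are handled analogously):
  //   x_center = raw_x / x_scale * anchor_w + anchor_xc
  //   y_center = raw_y / y_scale * anchor_h + anchor_yc
  //   h = apply_exponential ? exp(raw_h / h_scale) * anchor_h
  //                         : raw_h / h_scale * anchor_h     (w analogous)
  //   box = (ymin, xmin, ymax, xmax)
  //       = (y_center - h/2, x_center - w/2, y_center + h/2, x_center + w/2)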
+ const std::string decode_src = absl::Substitute( + R"( #version 310 es + +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +layout(location = 0) uniform vec4 scale; + +layout(std430, binding = 0) writeonly buffer Output { + float data[]; +} boxes; + +layout(std430, binding = 1) readonly buffer Input0 { + float data[]; +} raw_boxes; + +layout(std430, binding = 2) readonly buffer Input1 { + float data[]; +} raw_anchors; + +uint num_coords = uint($0); +int reverse_output_order = int($1); +int apply_exponential = int($2); +int box_coord_offset = int($3); +int num_keypoints = int($4); +int keypt_coord_offset = int($5); +int num_values_per_keypt = int($6); + +void main() { + uint g_idx = gl_GlobalInvocationID.x; // box index + uint box_offset = g_idx * num_coords + uint(box_coord_offset); + uint anchor_offset = g_idx * uint(4); // check kNumCoordsPerBox + + float y_center, x_center, h, w; + + if (reverse_output_order == int(0)) { + y_center = raw_boxes.data[box_offset + uint(0)]; + x_center = raw_boxes.data[box_offset + uint(1)]; + h = raw_boxes.data[box_offset + uint(2)]; + w = raw_boxes.data[box_offset + uint(3)]; + } else { + x_center = raw_boxes.data[box_offset + uint(0)]; + y_center = raw_boxes.data[box_offset + uint(1)]; + w = raw_boxes.data[box_offset + uint(2)]; + h = raw_boxes.data[box_offset + uint(3)]; + } + + float anchor_yc = raw_anchors.data[anchor_offset + uint(0)]; + float anchor_xc = raw_anchors.data[anchor_offset + uint(1)]; + float anchor_h = raw_anchors.data[anchor_offset + uint(2)]; + float anchor_w = raw_anchors.data[anchor_offset + uint(3)]; + + x_center = x_center / scale.x * anchor_w + anchor_xc; + y_center = y_center / scale.y * anchor_h + anchor_yc; + + if (apply_exponential == int(1)) { + h = exp(h / scale.w) * anchor_h; + w = exp(w / scale.z) * anchor_w; + } else { + h = (h / scale.w) * anchor_h; + w = (w / scale.z) * anchor_w; + } + + float ymin = y_center - h / 2.0; + float xmin = x_center - w / 2.0; + float ymax = y_center + h / 2.0; + float xmax = x_center + w / 2.0; + + boxes.data[box_offset + uint(0)] = ymin; + boxes.data[box_offset + uint(1)] = xmin; + boxes.data[box_offset + uint(2)] = ymax; + boxes.data[box_offset + uint(3)] = xmax; + + if (num_keypoints > int(0)){ + for (int k = 0; k < num_keypoints; ++k) { + int kp_offset = + int(g_idx * num_coords) + keypt_coord_offset + k * num_values_per_keypt; + float kp_y, kp_x; + if (reverse_output_order == int(0)) { + kp_y = raw_boxes.data[kp_offset + int(0)]; + kp_x = raw_boxes.data[kp_offset + int(1)]; + } else { + kp_x = raw_boxes.data[kp_offset + int(0)]; + kp_y = raw_boxes.data[kp_offset + int(1)]; + } + boxes.data[kp_offset + int(0)] = kp_x / scale.x * anchor_w + anchor_xc; + boxes.data[kp_offset + int(1)] = kp_y / scale.y * anchor_h + anchor_yc; + } + } +})", + options_.num_coords(), // box xywh + options_.reverse_output_order() ? 1 : 0, + options_.apply_exponential_on_box_size() ? 
1 : 0, + options_.box_coord_offset(), options_.num_keypoints(), + options_.keypoint_coord_offset(), options_.num_values_per_keypoint()); + + // Shader program + GLuint shader = glCreateShader(GL_COMPUTE_SHADER); + const GLchar* sources[] = {decode_src.c_str()}; + glShaderSource(shader, 1, sources, NULL); + glCompileShader(shader); + GLint compiled = GL_FALSE; + glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled); + RET_CHECK(compiled == GL_TRUE); + decode_program_ = glCreateProgram(); + glAttachShader(decode_program_, shader); + glDeleteShader(shader); + glLinkProgram(decode_program_); + + // Outputs + decoded_boxes_buffer_ = + absl::make_unique(Tensor::ElementType::kFloat32, + Tensor::Shape{1, num_boxes_ * num_coords_}); + raw_anchors_buffer_ = absl::make_unique( + Tensor::ElementType::kFloat32, + Tensor::Shape{1, num_boxes_ * kNumCoordsPerBox}); + // Parameters + glUseProgram(decode_program_); + glUniform4f(0, options_.x_scale(), options_.y_scale(), options_.w_scale(), + options_.h_scale()); + + // A shader to score detection boxes. + const std::string score_src = absl::Substitute( + R"( #version 310 es + +layout(local_size_x = 1, local_size_y = $0, local_size_z = 1) in; + +#define FLT_MAX 1.0e+37 + +shared float local_scores[$0]; + +layout(std430, binding = 0) writeonly buffer Output { + float data[]; +} scored_boxes; + +layout(std430, binding = 1) readonly buffer Input0 { + float data[]; +} raw_scores; + +uint num_classes = uint($0); +int apply_sigmoid = int($1); +int apply_clipping_thresh = int($2); +float clipping_thresh = float($3); +int ignore_class_0 = int($4); + +float optional_sigmoid(float x) { + if (apply_sigmoid == int(0)) return x; + if (apply_clipping_thresh == int(1)) { + x = clamp(x, -clipping_thresh, clipping_thresh); + } + x = 1.0 / (1.0 + exp(-x)); + return x; +} + +void main() { + uint g_idx = gl_GlobalInvocationID.x; // box idx + uint s_idx = gl_LocalInvocationID.y; // score/class idx + + // load all scores into shared memory + float score = raw_scores.data[g_idx * num_classes + s_idx]; + local_scores[s_idx] = optional_sigmoid(score); + memoryBarrierShared(); + barrier(); + + // find max score in shared memory + if (s_idx == uint(0)) { + float max_score = -FLT_MAX; + float max_class = -1.0; + for (int i=ignore_class_0; i max_score) { + max_score = local_scores[i]; + max_class = float(i); + } + } + scored_boxes.data[g_idx * uint(2) + uint(0)] = max_score; + scored_boxes.data[g_idx * uint(2) + uint(1)] = max_class; + } +})", + num_classes_, options_.sigmoid_score() ? 1 : 0, + options_.has_score_clipping_thresh() ? 1 : 0, + options_.has_score_clipping_thresh() ? options_.score_clipping_thresh() + : 0, + !ignore_classes_.empty() ? 1 : 0); + + // # filter classes supported is hardware dependent. + int max_wg_size; // typically <= 1024 + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, + &max_wg_size); // y-dim + CHECK_LT(num_classes_, max_wg_size) + << "# classes must be < " << max_wg_size; + // TODO support better filtering. 
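  // The score shader assigns one thread per class (local_size_y == num_classes),
  // hence the workgroup-size check above, and its max-score scan can only skip
  // class 0 (the ignore_class_0 flag), hence the single-ignored-class
  // restriction checked below.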
+ CHECK_LE(ignore_classes_.size(), 1) << "Only ignore class 0 is allowed"; + + // Shader program + { + GLuint shader = glCreateShader(GL_COMPUTE_SHADER); + const GLchar* sources[] = {score_src.c_str()}; + glShaderSource(shader, 1, sources, NULL); + glCompileShader(shader); + GLint compiled = GL_FALSE; + glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled); + RET_CHECK(compiled == GL_TRUE); + score_program_ = glCreateProgram(); + glAttachShader(score_program_, shader); + glDeleteShader(shader); + glLinkProgram(score_program_); + } + + // Outputs + scored_boxes_buffer_ = absl::make_unique( + Tensor::ElementType::kFloat32, Tensor::Shape{1, num_boxes_ * 2}); + + return ::mediapipe::OkStatus(); + })); + +#elif MEDIAPIPE_METAL_ENABLED + id device = gpu_helper_.mtlDevice; + + // A shader to decode detection boxes. + std::string decode_src = absl::Substitute( + R"( +#include + +using namespace metal; + +kernel void decodeKernel( + device float* boxes [[ buffer(0) ]], + device float* raw_boxes [[ buffer(1) ]], + device float* raw_anchors [[ buffer(2) ]], + uint2 gid [[ thread_position_in_grid ]]) { + + uint num_coords = uint($0); + int reverse_output_order = int($1); + int apply_exponential = int($2); + int box_coord_offset = int($3); + int num_keypoints = int($4); + int keypt_coord_offset = int($5); + int num_values_per_keypt = int($6); +)", + options_.num_coords(), // box xywh + options_.reverse_output_order() ? 1 : 0, + options_.apply_exponential_on_box_size() ? 1 : 0, + options_.box_coord_offset(), options_.num_keypoints(), + options_.keypoint_coord_offset(), options_.num_values_per_keypoint()); + decode_src += absl::Substitute( + R"( + float4 scale = float4(($0),($1),($2),($3)); +)", + options_.x_scale(), options_.y_scale(), options_.w_scale(), + options_.h_scale()); + decode_src += R"( + uint g_idx = gid.x; + uint box_offset = g_idx * num_coords + uint(box_coord_offset); + uint anchor_offset = g_idx * uint(4); // check kNumCoordsPerBox + + float y_center, x_center, h, w; + + if (reverse_output_order == int(0)) { + y_center = raw_boxes[box_offset + uint(0)]; + x_center = raw_boxes[box_offset + uint(1)]; + h = raw_boxes[box_offset + uint(2)]; + w = raw_boxes[box_offset + uint(3)]; + } else { + x_center = raw_boxes[box_offset + uint(0)]; + y_center = raw_boxes[box_offset + uint(1)]; + w = raw_boxes[box_offset + uint(2)]; + h = raw_boxes[box_offset + uint(3)]; + } + + float anchor_yc = raw_anchors[anchor_offset + uint(0)]; + float anchor_xc = raw_anchors[anchor_offset + uint(1)]; + float anchor_h = raw_anchors[anchor_offset + uint(2)]; + float anchor_w = raw_anchors[anchor_offset + uint(3)]; + + x_center = x_center / scale.x * anchor_w + anchor_xc; + y_center = y_center / scale.y * anchor_h + anchor_yc; + + if (apply_exponential == int(1)) { + h = exp(h / scale.w) * anchor_h; + w = exp(w / scale.z) * anchor_w; + } else { + h = (h / scale.w) * anchor_h; + w = (w / scale.z) * anchor_w; + } + + float ymin = y_center - h / 2.0; + float xmin = x_center - w / 2.0; + float ymax = y_center + h / 2.0; + float xmax = x_center + w / 2.0; + + boxes[box_offset + uint(0)] = ymin; + boxes[box_offset + uint(1)] = xmin; + boxes[box_offset + uint(2)] = ymax; + boxes[box_offset + uint(3)] = xmax; + + if (num_keypoints > int(0)){ + for (int k = 0; k < num_keypoints; ++k) { + int kp_offset = + int(g_idx * num_coords) + keypt_coord_offset + k * num_values_per_keypt; + float kp_y, kp_x; + if (reverse_output_order == int(0)) { + kp_y = raw_boxes[kp_offset + int(0)]; + kp_x = raw_boxes[kp_offset + int(1)]; + } else { + 
kp_x = raw_boxes[kp_offset + int(0)]; + kp_y = raw_boxes[kp_offset + int(1)]; + } + boxes[kp_offset + int(0)] = kp_x / scale.x * anchor_w + anchor_xc; + boxes[kp_offset + int(1)] = kp_y / scale.y * anchor_h + anchor_yc; + } + } +})"; + + { + // Shader program + NSString* library_source = + [NSString stringWithUTF8String:decode_src.c_str()]; + NSError* error = nil; + id library = [device newLibraryWithSource:library_source + options:nullptr + error:&error]; + RET_CHECK(library != nil) << "Couldn't create shader library " + << [[error localizedDescription] UTF8String]; + id kernel_func = nil; + kernel_func = [library newFunctionWithName:@"decodeKernel"]; + RET_CHECK(kernel_func != nil) << "Couldn't create kernel function."; + decode_program_ = + [device newComputePipelineStateWithFunction:kernel_func error:&error]; + RET_CHECK(decode_program_ != nil) << "Couldn't create pipeline state " << + [[error localizedDescription] UTF8String]; + // Outputs + decoded_boxes_buffer_ = + absl::make_unique(Tensor::ElementType::kFloat32, + Tensor::Shape{1, num_boxes_ * num_coords_}); + // Inputs + raw_anchors_buffer_ = absl::make_unique( + Tensor::ElementType::kFloat32, + Tensor::Shape{1, num_boxes_ * kNumCoordsPerBox}); + } + + // A shader to score detection boxes. + const std::string score_src = absl::Substitute( + R"( +#include + +using namespace metal; + +float optional_sigmoid(float x) { + int apply_sigmoid = int($1); + int apply_clipping_thresh = int($2); + float clipping_thresh = float($3); + if (apply_sigmoid == int(0)) return x; + if (apply_clipping_thresh == int(1)) { + x = clamp(x, -clipping_thresh, clipping_thresh); + } + x = 1.0 / (1.0 + exp(-x)); + return x; +} + +kernel void scoreKernel( + device float* scored_boxes [[ buffer(0) ]], + device float* raw_scores [[ buffer(1) ]], + uint2 tid [[ thread_position_in_threadgroup ]], + uint2 gid [[ thread_position_in_grid ]]) { + + uint num_classes = uint($0); + int apply_sigmoid = int($1); + int apply_clipping_thresh = int($2); + float clipping_thresh = float($3); + int ignore_class_0 = int($4); + + uint g_idx = gid.x; // box idx + uint s_idx = tid.y; // score/class idx + + // load all scores into shared memory + threadgroup float local_scores[$0]; + float score = raw_scores[g_idx * num_classes + s_idx]; + local_scores[s_idx] = optional_sigmoid(score); + threadgroup_barrier(mem_flags::mem_threadgroup); + + // find max score in shared memory + if (s_idx == uint(0)) { + float max_score = -FLT_MAX; + float max_class = -1.0; + for (int i=ignore_class_0; i max_score) { + max_score = local_scores[i]; + max_class = float(i); + } + } + scored_boxes[g_idx * uint(2) + uint(0)] = max_score; + scored_boxes[g_idx * uint(2) + uint(1)] = max_class; + } +})", + num_classes_, options_.sigmoid_score() ? 1 : 0, + options_.has_score_clipping_thresh() ? 1 : 0, + options_.has_score_clipping_thresh() ? options_.score_clipping_thresh() + : 0, + ignore_classes_.size() ? 1 : 0); + + // TODO support better filtering. 
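+  // As in the GL path above, the Metal score kernel uses one threadgroup per
+  // box with one thread per class and reduces the per-class scores in
+  // threadgroup memory, so num_classes_ is bounded by the pipeline's
+  // maxTotalThreadsPerThreadgroup (checked below).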
+ CHECK_LE(ignore_classes_.size(), 1) << "Only ignore class 0 is allowed"; + + { + // Shader program + NSString* library_source = + [NSString stringWithUTF8String:score_src.c_str()]; + NSError* error = nil; + id library = [device newLibraryWithSource:library_source + options:nullptr + error:&error]; + RET_CHECK(library != nil) << "Couldn't create shader library " + << [[error localizedDescription] UTF8String]; + id kernel_func = nil; + kernel_func = [library newFunctionWithName:@"scoreKernel"]; + RET_CHECK(kernel_func != nil) << "Couldn't create kernel function."; + score_program_ = + [device newComputePipelineStateWithFunction:kernel_func error:&error]; + RET_CHECK(score_program_ != nil) << "Couldn't create pipeline state " << + [[error localizedDescription] UTF8String]; + // Outputs + scored_boxes_buffer_ = absl::make_unique( + Tensor::ElementType::kFloat32, Tensor::Shape{1, num_boxes_ * 2}); + // # filter classes supported is hardware dependent. + int max_wg_size = score_program_.maxTotalThreadsPerThreadgroup; + CHECK_LT(num_classes_, max_wg_size) << "# classes must be <" << max_wg_size; + } + +#endif // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE) + + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/tensors_to_detections_calculator.proto b/mediapipe/calculators/tensor/tensors_to_detections_calculator.proto new file mode 100644 index 0000000000..24c0a50533 --- /dev/null +++ b/mediapipe/calculators/tensor/tensors_to_detections_calculator.proto @@ -0,0 +1,74 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The option proto for the TensorsToDetectionsCalculator. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message TensorsToDetectionsCalculatorOptions { + extend .mediapipe.CalculatorOptions { + optional TensorsToDetectionsCalculatorOptions ext = 335742639; + } + + // [Required] The number of output classes predicted by the detection model. + optional int32 num_classes = 1; + // [Required] The number of output boxes predicted by the detection model. + optional int32 num_boxes = 2; + // [Required] The number of output values per boxes predicted by the detection + // model. The values contain bounding boxes, keypoints, etc. + optional int32 num_coords = 3; + + // The offset of keypoint coordinates in the location tensor. + optional int32 keypoint_coord_offset = 9; + // The number of predicted keypoints. + optional int32 num_keypoints = 10 [default = 0]; + // The dimension of each keypoint, e.g. number of values predicted for each + // keypoint. + optional int32 num_values_per_keypoint = 11 [default = 2]; + // The offset of box coordinates in the location tensor. + optional int32 box_coord_offset = 12 [default = 0]; + + // Parameters for decoding SSD detection model. 
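+  // The decoder divides the raw box center offsets by x_scale/y_scale and the
+  // raw width/height by w_scale/h_scale before combining them with the anchor
+  // center and size (see apply_exponential_on_box_size for the size terms).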
+ optional float x_scale = 4 [default = 0.0]; + optional float y_scale = 5 [default = 0.0]; + optional float w_scale = 6 [default = 0.0]; + optional float h_scale = 7 [default = 0.0]; + + optional bool apply_exponential_on_box_size = 13 [default = false]; + + // Whether to reverse the order of predicted x, y from output. + // If false, the order is [y_center, x_center, h, w], if true the order is + // [x_center, y_center, w, h]. + optional bool reverse_output_order = 14 [default = false]; + // The ids of classes that should be ignored during decoding the score for + // each predicted box. + repeated int32 ignore_classes = 8; + + optional bool sigmoid_score = 15 [default = false]; + optional float score_clipping_thresh = 16; + + // Whether the detection coordinates from the input tensors should be flipped + // vertically (along the y-direction). This is useful, for example, when the + // input tensors represent detections defined with a coordinate system where + // the origin is at the top-left corner, whereas the desired detection + // representation has a bottom-left origin (e.g., in OpenGL). + optional bool flip_vertically = 18 [default = false]; + + // Score threshold for perserving decoded detections. + optional float min_score_thresh = 19; +} diff --git a/mediapipe/calculators/tensor/tensors_to_floats_calculator.cc b/mediapipe/calculators/tensor/tensors_to_floats_calculator.cc new file mode 100644 index 0000000000..74731ebb1b --- /dev/null +++ b/mediapipe/calculators/tensor/tensors_to_floats_calculator.cc @@ -0,0 +1,97 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/tensor.h" +#include "mediapipe/framework/port/ret_check.h" + +namespace mediapipe { + +// A calculator for converting Tensors to to a float or a float vector. +// +// Input: +// TENSORS - Vector of Tensors of type kFloat32. Only the first +// tensor will be used. +// Output: +// FLOAT(optional) - Converted single float number. +// FLOATS(optional) - Converted float vector. +// +// Notes: To output FLOAT stream, the input tensor must have size 1, e.g. +// only 1 float number in the tensor. 
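+// To output FLOATS stream, all values of the first input tensor are copied
+// into the output vector in order.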
+//
+// Usage example:
+// node {
+//   calculator: "TensorsToFloatsCalculator"
+//   input_stream: "TENSORS:tensors"
+//   output_stream: "FLOATS:floats"
+// }
+class TensorsToFloatsCalculator : public CalculatorBase {
+ public:
+  static ::mediapipe::Status GetContract(CalculatorContract* cc);
+
+  ::mediapipe::Status Open(CalculatorContext* cc) override;
+
+  ::mediapipe::Status Process(CalculatorContext* cc) override;
+};
+REGISTER_CALCULATOR(TensorsToFloatsCalculator);
+
+::mediapipe::Status TensorsToFloatsCalculator::GetContract(
+    CalculatorContract* cc) {
+  RET_CHECK(cc->Inputs().HasTag("TENSORS"));
+  RET_CHECK(cc->Outputs().HasTag("FLOATS") || cc->Outputs().HasTag("FLOAT"));
+
+  cc->Inputs().Tag("TENSORS").Set<std::vector<Tensor>>();
+  if (cc->Outputs().HasTag("FLOATS")) {
+    cc->Outputs().Tag("FLOATS").Set<std::vector<float>>();
+  }
+  if (cc->Outputs().HasTag("FLOAT")) {
+    cc->Outputs().Tag("FLOAT").Set<float>();
+  }
+
+  return ::mediapipe::OkStatus();
+}
+
+::mediapipe::Status TensorsToFloatsCalculator::Open(CalculatorContext* cc) {
+  cc->SetOffset(TimestampDiff(0));
+
+  return ::mediapipe::OkStatus();
+}
+
+::mediapipe::Status TensorsToFloatsCalculator::Process(CalculatorContext* cc) {
+  RET_CHECK(!cc->Inputs().Tag("TENSORS").IsEmpty());
+
+  const auto& input_tensors =
+      cc->Inputs().Tag("TENSORS").Get<std::vector<Tensor>>();
+  // TODO: Add option to specify which tensor to take from.
+  auto view = input_tensors[0].GetCpuReadView();
+  auto raw_floats = view.buffer<float>();
+  int num_values = input_tensors[0].shape().num_elements();
+
+  if (cc->Outputs().HasTag("FLOAT")) {
+    // TODO: Could add an index in the option to specify returning one
+    // value of a float array.
+    RET_CHECK_EQ(num_values, 1);
+    cc->Outputs().Tag("FLOAT").AddPacket(
+        MakePacket<float>(raw_floats[0]).At(cc->InputTimestamp()));
+  }
+  if (cc->Outputs().HasTag("FLOATS")) {
+    auto output_floats = absl::make_unique<std::vector<float>>(
+        raw_floats, raw_floats + num_values);
+    cc->Outputs().Tag("FLOATS").Add(output_floats.release(),
+                                    cc->InputTimestamp());
+  }
+
+  return ::mediapipe::OkStatus();
+}
+}  // namespace mediapipe
diff --git a/mediapipe/calculators/tensor/tensors_to_landmarks_calculator.cc b/mediapipe/calculators/tensor/tensors_to_landmarks_calculator.cc
new file mode 100644
index 0000000000..731e904ad6
--- /dev/null
+++ b/mediapipe/calculators/tensor/tensors_to_landmarks_calculator.cc
@@ -0,0 +1,250 @@
+// Copyright 2019 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mediapipe/calculators/tensor/tensors_to_landmarks_calculator.pb.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/formats/landmark.pb.h"
+#include "mediapipe/framework/formats/tensor.h"
+#include "mediapipe/framework/port/ret_check.h"
+
+namespace mediapipe {
+
+// A calculator for converting Tensors from regression models into landmarks.
+// Note that if the landmarks in the tensor have more than 5 dimensions, only
+// the first 5 dimensions will be converted to [x, y, z, visibility, presence].
+//
+// Input:
+//  TENSORS - Vector of Tensors of type kFloat32.
Only the first tensor will be +// used. The size of the values must be (num_dimension x num_landmarks). +// +// FLIP_HORIZONTALLY (optional): Whether to flip landmarks horizontally or +// not. Overrides corresponding side packet and/or field in the calculator +// options. +// +// FLIP_VERTICALLY (optional): Whether to flip landmarks vertically or not. +// Overrides corresponding side packet and/or field in the calculator options. +// +// Input side packet: +// FLIP_HORIZONTALLY (optional): Whether to flip landmarks horizontally or +// not. Overrides the corresponding field in the calculator options. +// +// FLIP_VERTICALLY (optional): Whether to flip landmarks vertically or not. +// Overrides the corresponding field in the calculator options. +// +// Output: +// LANDMARKS(optional) - Result MediaPipe landmarks. +// NORM_LANDMARKS(optional) - Result MediaPipe normalized landmarks. +// +// Notes: +// To output normalized landmarks, user must provide the original input image +// size to the model using calculator option input_image_width and +// input_image_height. +// Usage example: +// node { +// calculator: "TensorsToLandmarksCalculator" +// input_stream: "TENSORS:landmark_tensors" +// output_stream: "LANDMARKS:landmarks" +// output_stream: "NORM_LANDMARKS:landmarks" +// options: { +// [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { +// num_landmarks: 21 +// +// input_image_width: 256 +// input_image_height: 256 +// } +// } +// } +class TensorsToLandmarksCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + ::mediapipe::Status LoadOptions(CalculatorContext* cc); + int num_landmarks_ = 0; + bool flip_vertically_ = false; + bool flip_horizontally_ = false; + + ::mediapipe::TensorsToLandmarksCalculatorOptions options_; +}; +REGISTER_CALCULATOR(TensorsToLandmarksCalculator); + +::mediapipe::Status TensorsToLandmarksCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag("TENSORS")) { + cc->Inputs().Tag("TENSORS").Set>(); + } + + if (cc->Inputs().HasTag("FLIP_HORIZONTALLY")) { + cc->Inputs().Tag("FLIP_HORIZONTALLY").Set(); + } + + if (cc->Inputs().HasTag("FLIP_VERTICALLY")) { + cc->Inputs().Tag("FLIP_VERTICALLY").Set(); + } + + if (cc->InputSidePackets().HasTag("FLIP_HORIZONTALLY")) { + cc->InputSidePackets().Tag("FLIP_HORIZONTALLY").Set(); + } + + if (cc->InputSidePackets().HasTag("FLIP_VERTICALLY")) { + cc->InputSidePackets().Tag("FLIP_VERTICALLY").Set(); + } + + if (cc->Outputs().HasTag("LANDMARKS")) { + cc->Outputs().Tag("LANDMARKS").Set(); + } + + if (cc->Outputs().HasTag("NORM_LANDMARKS")) { + cc->Outputs().Tag("NORM_LANDMARKS").Set(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToLandmarksCalculator::Open(CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + + MP_RETURN_IF_ERROR(LoadOptions(cc)); + + if (cc->Outputs().HasTag("NORM_LANDMARKS")) { + RET_CHECK(options_.has_input_image_height() && + options_.has_input_image_width()) + << "Must provide input with/height for getting normalized landmarks."; + } + if (cc->Outputs().HasTag("LANDMARKS") && + (options_.flip_vertically() || options_.flip_horizontally() || + cc->InputSidePackets().HasTag("FLIP_HORIZONTALLY") || + cc->InputSidePackets().HasTag("FLIP_VERTICALLY"))) { + 
RET_CHECK(options_.has_input_image_height() && + options_.has_input_image_width()) + << "Must provide input with/height for using flip_vertically option " + "when outputing landmarks in absolute coordinates."; + } + + flip_horizontally_ = + cc->InputSidePackets().HasTag("FLIP_HORIZONTALLY") + ? cc->InputSidePackets().Tag("FLIP_HORIZONTALLY").Get() + : options_.flip_horizontally(); + + flip_vertically_ = + cc->InputSidePackets().HasTag("FLIP_VERTICALLY") + ? cc->InputSidePackets().Tag("FLIP_VERTICALLY").Get() + : options_.flip_vertically(); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToLandmarksCalculator::Process( + CalculatorContext* cc) { + // Override values if specified so. + if (cc->Inputs().HasTag("FLIP_HORIZONTALLY") && + !cc->Inputs().Tag("FLIP_HORIZONTALLY").IsEmpty()) { + flip_horizontally_ = cc->Inputs().Tag("FLIP_HORIZONTALLY").Get(); + } + if (cc->Inputs().HasTag("FLIP_VERTICALLY") && + !cc->Inputs().Tag("FLIP_VERTICALLY").IsEmpty()) { + flip_vertically_ = cc->Inputs().Tag("FLIP_VERTICALLY").Get(); + } + + if (cc->Inputs().Tag("TENSORS").IsEmpty()) { + return ::mediapipe::OkStatus(); + } + + const auto& input_tensors = + cc->Inputs().Tag("TENSORS").Get>(); + + int num_values = input_tensors[0].shape().num_elements(); + const int num_dimensions = num_values / num_landmarks_; + CHECK_GT(num_dimensions, 0); + + auto view = input_tensors[0].GetCpuReadView(); + auto raw_landmarks = view.buffer(); + + LandmarkList output_landmarks; + + for (int ld = 0; ld < num_landmarks_; ++ld) { + const int offset = ld * num_dimensions; + Landmark* landmark = output_landmarks.add_landmark(); + + if (flip_horizontally_) { + landmark->set_x(options_.input_image_width() - raw_landmarks[offset]); + } else { + landmark->set_x(raw_landmarks[offset]); + } + if (num_dimensions > 1) { + if (flip_vertically_) { + landmark->set_y(options_.input_image_height() - + raw_landmarks[offset + 1]); + } else { + landmark->set_y(raw_landmarks[offset + 1]); + } + } + if (num_dimensions > 2) { + landmark->set_z(raw_landmarks[offset + 2]); + } + if (num_dimensions > 3) { + landmark->set_visibility(raw_landmarks[offset + 3]); + } + if (num_dimensions > 4) { + landmark->set_presence(raw_landmarks[offset + 4]); + } + } + + // Output normalized landmarks if required. + if (cc->Outputs().HasTag("NORM_LANDMARKS")) { + NormalizedLandmarkList output_norm_landmarks; + for (int i = 0; i < output_landmarks.landmark_size(); ++i) { + const Landmark& landmark = output_landmarks.landmark(i); + NormalizedLandmark* norm_landmark = output_norm_landmarks.add_landmark(); + norm_landmark->set_x(landmark.x() / options_.input_image_width()); + norm_landmark->set_y(landmark.y() / options_.input_image_height()); + // Scale Z coordinate as X + allow additional uniform normalization. + norm_landmark->set_z(landmark.z() / options_.input_image_width() / + options_.normalize_z()); + norm_landmark->set_visibility(landmark.visibility()); + norm_landmark->set_presence(landmark.presence()); + } + cc->Outputs() + .Tag("NORM_LANDMARKS") + .AddPacket(MakePacket(output_norm_landmarks) + .At(cc->InputTimestamp())); + } + + // Output absolute landmarks. + if (cc->Outputs().HasTag("LANDMARKS")) { + cc->Outputs() + .Tag("LANDMARKS") + .AddPacket(MakePacket(output_landmarks) + .At(cc->InputTimestamp())); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TensorsToLandmarksCalculator::LoadOptions( + CalculatorContext* cc) { + // Get calculator options specified in the graph. 
+ options_ = cc->Options<::mediapipe::TensorsToLandmarksCalculatorOptions>(); + RET_CHECK(options_.has_num_landmarks()); + num_landmarks_ = options_.num_landmarks(); + + return ::mediapipe::OkStatus(); +} +} // namespace mediapipe diff --git a/mediapipe/calculators/tensor/tensors_to_landmarks_calculator.proto b/mediapipe/calculators/tensor/tensors_to_landmarks_calculator.proto new file mode 100644 index 0000000000..c321fe8d16 --- /dev/null +++ b/mediapipe/calculators/tensor/tensors_to_landmarks_calculator.proto @@ -0,0 +1,54 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The option proto for the TensorsToLandmarksCalculator. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message TensorsToLandmarksCalculatorOptions { + extend .mediapipe.CalculatorOptions { + optional TensorsToLandmarksCalculatorOptions ext = 335742640; + } + + // [Required] Number of landmarks from the output of the model. + optional int32 num_landmarks = 1; + + // Size of the input image for the model. These options are used only when + // normalized landmarks are needed. Z coordinate is scaled as X assuming + // a weak perspective projection camera model. + optional int32 input_image_width = 2; + optional int32 input_image_height = 3; + + // Whether the detection coordinates from the input tensors should be flipped + // vertically (along the y-direction). This is useful, for example, when the + // input tensors represent detections defined with a coordinate system where + // the origin is at the top-left corner, whereas the desired detection + // representation has a bottom-left origin (e.g., in OpenGL). + optional bool flip_vertically = 4 [default = false]; + + // Whether the detection coordinates from the input tensors should be flipped + // horizontally (along the x-direction). This is useful, for example, when the + // input image is horizontally flipped in ImageTransformationCalculator + // beforehand. + optional bool flip_horizontally = 6 [default = false]; + + // A value that Z coordinates should be divided by. This option is used only + // when normalized landmarks are needed. It is applied in addition to Z + // coordinate being re-scaled as X. 
+ optional float normalize_z = 5 [default = 1.0]; +} diff --git a/mediapipe/calculators/tensor/testdata/add.bin b/mediapipe/calculators/tensor/testdata/add.bin new file mode 100644 index 0000000000..b4c02350c0 Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/add.bin differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/input.jpg b/mediapipe/calculators/tensor/testdata/image_to_tensor/input.jpg new file mode 100644 index 0000000000..37d6c4b20a Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/input.jpg differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect.png b/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect.png new file mode 100644 index 0000000000..38a13dabe9 Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect.png differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect_keep_aspect.png b/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect_keep_aspect.png new file mode 100644 index 0000000000..254dc72ae9 Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect_keep_aspect.png differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect_keep_aspect_with_rotation.png b/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect_keep_aspect_with_rotation.png new file mode 100644 index 0000000000..104cb60915 Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/large_sub_rect_keep_aspect_with_rotation.png differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_keep_aspect.png b/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_keep_aspect.png new file mode 100644 index 0000000000..aba8d2591d Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_keep_aspect.png differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_keep_aspect_with_rotation.png b/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_keep_aspect_with_rotation.png new file mode 100644 index 0000000000..5ce7c3ec36 Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_keep_aspect_with_rotation.png differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_with_rotation.png b/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_with_rotation.png new file mode 100644 index 0000000000..ecfb1e5375 Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/medium_sub_rect_with_rotation.png differ diff --git a/mediapipe/calculators/tensor/testdata/image_to_tensor/noop_except_range.png b/mediapipe/calculators/tensor/testdata/image_to_tensor/noop_except_range.png new file mode 100644 index 0000000000..1486d9f151 Binary files /dev/null and b/mediapipe/calculators/tensor/testdata/image_to_tensor/noop_except_range.png differ diff --git a/mediapipe/calculators/tensor/testdata/labelmap.txt b/mediapipe/calculators/tensor/testdata/labelmap.txt new file mode 100644 index 0000000000..4291e3c6b4 --- /dev/null +++ b/mediapipe/calculators/tensor/testdata/labelmap.txt @@ -0,0 +1,3 @@ +classA +classB +classC diff --git a/mediapipe/calculators/tensorflow/unpack_media_sequence_calculator.cc b/mediapipe/calculators/tensorflow/unpack_media_sequence_calculator.cc index 
86a2a4afa0..8bd0273e0c 100644 --- a/mediapipe/calculators/tensorflow/unpack_media_sequence_calculator.cc +++ b/mediapipe/calculators/tensorflow/unpack_media_sequence_calculator.cc @@ -84,7 +84,7 @@ namespace mpms = ::mediapipe::mediasequence; // node { // calculator: "UnpackMediaSequenceCalculator" // input_side_packet: "SEQUENCE_EXAMPLE:example_input_side_packet" -// input_side_packet: "ROOT_DIRECTORY:path_to_dataset_root_directory" +// input_side_packet: "DATASET_ROOT:path_to_dataset_root_directory" // output_side_packet: "DATA_PATH:full_path_to_data_element" // output_side_packet: "RESAMPLER_OPTIONS:packet_resampler_options" // options { diff --git a/mediapipe/calculators/tflite/tflite_inference_calculator.cc b/mediapipe/calculators/tflite/tflite_inference_calculator.cc index 29bc4a59ff..314637e591 100644 --- a/mediapipe/calculators/tflite/tflite_inference_calculator.cc +++ b/mediapipe/calculators/tflite/tflite_inference_calculator.cc @@ -404,12 +404,7 @@ ::mediapipe::Status TfLiteInferenceCalculator::Open(CalculatorContext* cc) { MP_RETURN_IF_ERROR(LoadDelegate(cc)); #endif } else { - // TODO: why only on these platforms? - // It seems that the XNNPACK delegate fails to load on Linux. -#if defined(__EMSCRIPTEN__) || defined(MEDIAPIPE_ANDROID) || \ - defined(MEDIAPIPE_IOS) MP_RETURN_IF_ERROR(LoadDelegate(cc)); -#endif // __EMSCRIPTEN__ || MEDIAPIPE_ANDROID || MEDIAPIPE_IOS } return ::mediapipe::OkStatus(); } diff --git a/mediapipe/calculators/util/BUILD b/mediapipe/calculators/util/BUILD index b515c57293..7914390e97 100644 --- a/mediapipe/calculators/util/BUILD +++ b/mediapipe/calculators/util/BUILD @@ -929,6 +929,7 @@ cc_library( deps = [ ":collection_has_min_size_calculator_cc_proto", "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/formats:landmark_cc_proto", "//mediapipe/framework/formats:rect_cc_proto", "//mediapipe/framework/port:ret_check", @@ -1043,3 +1044,26 @@ cc_library( ], alwayslink = 1, ) + +mediapipe_proto_library( + name = "logic_calculator_proto", + srcs = ["logic_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +cc_library( + name = "logic_calculator", + srcs = ["logic_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":logic_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + ], + alwayslink = 1, +) diff --git a/mediapipe/calculators/util/collection_has_min_size_calculator.cc b/mediapipe/calculators/util/collection_has_min_size_calculator.cc index 22bfb9c4ca..956818c87b 100644 --- a/mediapipe/calculators/util/collection_has_min_size_calculator.cc +++ b/mediapipe/calculators/util/collection_has_min_size_calculator.cc @@ -17,18 +17,24 @@ #include +#include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/landmark.pb.h" #include "mediapipe/framework/formats/rect.pb.h" namespace mediapipe { -typedef CollectionHasMinSizeCalculator> +typedef CollectionHasMinSizeCalculator> NormalizedRectVectorHasMinSizeCalculator; REGISTER_CALCULATOR(NormalizedRectVectorHasMinSizeCalculator); typedef CollectionHasMinSizeCalculator< - std::vector<::mediapipe::NormalizedLandmarkList>> + std::vector> NormalizedLandmarkListVectorHasMinSizeCalculator; REGISTER_CALCULATOR(NormalizedLandmarkListVectorHasMinSizeCalculator); +typedef 
CollectionHasMinSizeCalculator<
+    std::vector<ClassificationList>>
+    ClassificationListVectorHasMinSizeCalculator;
+REGISTER_CALCULATOR(ClassificationListVectorHasMinSizeCalculator);
+
 }  // namespace mediapipe
diff --git a/mediapipe/calculators/util/detections_to_rects_calculator.cc b/mediapipe/calculators/util/detections_to_rects_calculator.cc
index 52ba9dd7ab..a126ed8cbb 100644
--- a/mediapipe/calculators/util/detections_to_rects_calculator.cc
+++ b/mediapipe/calculators/util/detections_to_rects_calculator.cc
@@ -14,6 +14,7 @@
 #include "mediapipe/calculators/util/detections_to_rects_calculator.h"
 #include <cmath>
+#include <limits>
 #include "mediapipe/calculators/util/detections_to_rects_calculator.pb.h"
 #include "mediapipe/framework/calculator_framework.h"
@@ -36,19 +37,70 @@ constexpr char kNormRectTag[] = "NORM_RECT";
 constexpr char kRectsTag[] = "RECTS";
 constexpr char kNormRectsTag[] = "NORM_RECTS";
+constexpr float kMinFloat = std::numeric_limits<float>::lowest();
+constexpr float kMaxFloat = std::numeric_limits<float>::max();
+
+::mediapipe::Status NormRectFromKeyPoints(const LocationData& location_data,
+                                          NormalizedRect* rect) {
+  RET_CHECK_GT(location_data.relative_keypoints_size(), 1)
+      << "2 or more key points required to calculate a rect.";
+  float xmin = kMaxFloat;
+  float ymin = kMaxFloat;
+  float xmax = kMinFloat;
+  float ymax = kMinFloat;
+  for (int i = 0; i < location_data.relative_keypoints_size(); ++i) {
+    const auto& kp = location_data.relative_keypoints(i);
+    xmin = std::min(xmin, kp.x());
+    ymin = std::min(ymin, kp.y());
+    xmax = std::max(xmax, kp.x());
+    ymax = std::max(ymax, kp.y());
+  }
+  rect->set_x_center((xmin + xmax) / 2);
+  rect->set_y_center((ymin + ymax) / 2);
+  rect->set_width(xmax - xmin);
+  rect->set_height(ymax - ymin);
+  return ::mediapipe::OkStatus();
+}
+
+template <class B, class R>
+void RectFromBox(B box, R* rect) {
+  rect->set_x_center(box.xmin() + box.width() / 2);
+  rect->set_y_center(box.ymin() + box.height() / 2);
+  rect->set_width(box.width());
+  rect->set_height(box.height());
+}
+
 }  // namespace
 ::mediapipe::Status DetectionsToRectsCalculator::DetectionToRect(
     const Detection& detection, const DetectionSpec& detection_spec,
     Rect* rect) {
   const LocationData location_data = detection.location_data();
-  RET_CHECK(location_data.format() == LocationData::BOUNDING_BOX)
-      << "Only Detection with formats of BOUNDING_BOX can be converted to Rect";
-  const LocationData::BoundingBox bounding_box = location_data.bounding_box();
-  rect->set_x_center(bounding_box.xmin() + bounding_box.width() / 2);
-  rect->set_y_center(bounding_box.ymin() + bounding_box.height() / 2);
-  rect->set_width(bounding_box.width());
-  rect->set_height(bounding_box.height());
+  switch (options_.conversion_mode()) {
+    case mediapipe::DetectionsToRectsCalculatorOptions_ConversionMode_DEFAULT:
+    case mediapipe::
+        DetectionsToRectsCalculatorOptions_ConversionMode_USE_BOUNDING_BOX: {
+      RET_CHECK(location_data.format() == LocationData::BOUNDING_BOX)
+          << "Only Detection with formats of BOUNDING_BOX can be converted to "
+             "Rect";
+      RectFromBox(location_data.bounding_box(), rect);
+      break;
+    }
+    case mediapipe::
+        DetectionsToRectsCalculatorOptions_ConversionMode_USE_KEYPOINTS: {
+      RET_CHECK(detection_spec.image_size.has_value())
+          << "Rect with absolute coordinates calculation requires image size.";
+      const int width = detection_spec.image_size->first;
+      const int height = detection_spec.image_size->second;
+      NormalizedRect norm_rect;
+      MP_RETURN_IF_ERROR(NormRectFromKeyPoints(location_data, &norm_rect));
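+      // Scale the normalized keypoint-derived rect back to absolute pixel
+      // coordinates using the image size from the detection spec.
+      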
rect->set_x_center(std::round(norm_rect.x_center() * width)); + rect->set_y_center(std::round(norm_rect.y_center() * height)); + rect->set_width(std::round(norm_rect.width() * width)); + rect->set_height(std::round(norm_rect.height() * height)); + break; + } + } return ::mediapipe::OkStatus(); } @@ -56,15 +108,22 @@ ::mediapipe::Status DetectionsToRectsCalculator::DetectionToNormalizedRect( const Detection& detection, const DetectionSpec& detection_spec, NormalizedRect* rect) { const LocationData location_data = detection.location_data(); - RET_CHECK(location_data.format() == LocationData::RELATIVE_BOUNDING_BOX) - << "Only Detection with formats of RELATIVE_BOUNDING_BOX can be " - "converted to NormalizedRect"; - const LocationData::RelativeBoundingBox bounding_box = - location_data.relative_bounding_box(); - rect->set_x_center(bounding_box.xmin() + bounding_box.width() / 2); - rect->set_y_center(bounding_box.ymin() + bounding_box.height() / 2); - rect->set_width(bounding_box.width()); - rect->set_height(bounding_box.height()); + switch (options_.conversion_mode()) { + case mediapipe::DetectionsToRectsCalculatorOptions_ConversionMode_DEFAULT: + case mediapipe:: + DetectionsToRectsCalculatorOptions_ConversionMode_USE_BOUNDING_BOX: { + RET_CHECK(location_data.format() == LocationData::RELATIVE_BOUNDING_BOX) + << "Only Detection with formats of RELATIVE_BOUNDING_BOX can be " + "converted to NormalizedRect"; + RectFromBox(location_data.relative_bounding_box(), rect); + break; + } + case mediapipe:: + DetectionsToRectsCalculatorOptions_ConversionMode_USE_KEYPOINTS: { + MP_RETURN_IF_ERROR(NormRectFromKeyPoints(location_data, rect)); + break; + } + } return ::mediapipe::OkStatus(); } diff --git a/mediapipe/calculators/util/detections_to_rects_calculator.proto b/mediapipe/calculators/util/detections_to_rects_calculator.proto index 8d1a49a1e0..d49eb6c520 100644 --- a/mediapipe/calculators/util/detections_to_rects_calculator.proto +++ b/mediapipe/calculators/util/detections_to_rects_calculator.proto @@ -35,4 +35,12 @@ message DetectionsToRectsCalculatorOptions { // Whether to output a zero-rect (with origin and size both zero) when the // input detection vector is empty. optional bool output_zero_rect_for_empty_detections = 5; + + enum ConversionMode { + DEFAULT = 0; + USE_BOUNDING_BOX = 1; + USE_KEYPOINTS = 2; + } + + optional ConversionMode conversion_mode = 6; } diff --git a/mediapipe/calculators/util/detections_to_rects_calculator_test.cc b/mediapipe/calculators/util/detections_to_rects_calculator_test.cc index 7281847ca3..e526a85322 100644 --- a/mediapipe/calculators/util/detections_to_rects_calculator_test.cc +++ b/mediapipe/calculators/util/detections_to_rects_calculator_test.cc @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
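+// The tests below cover both the bounding-box conversion paths and the
+// keypoint-based (USE_KEYPOINTS) conversion mode of
+// DetectionsToRectsCalculator.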
+#include +#include +#include + #include "mediapipe/framework/calculator.pb.h" #include "mediapipe/framework/calculator_framework.h" #include "mediapipe/framework/calculator_runner.h" @@ -26,6 +30,21 @@ #include "mediapipe/framework/port/status_matchers.h" namespace mediapipe { +namespace { + +MATCHER_P4(RectEq, x_center, y_center, width, height, "") { + return testing::Value(arg.x_center(), testing::Eq(x_center)) && + testing::Value(arg.y_center(), testing::Eq(y_center)) && + testing::Value(arg.width(), testing::Eq(width)) && + testing::Value(arg.height(), testing::Eq(height)); +} + +MATCHER_P4(NormRectEq, x_center, y_center, width, height, "") { + return testing::Value(arg.x_center(), testing::FloatEq(x_center)) && + testing::Value(arg.y_center(), testing::FloatEq(y_center)) && + testing::Value(arg.width(), testing::FloatEq(width)) && + testing::Value(arg.height(), testing::FloatEq(height)); +} Detection DetectionWithLocationData(int32 xmin, int32 ymin, int32 width, int32 height) { @@ -39,6 +58,19 @@ Detection DetectionWithLocationData(int32 xmin, int32 ymin, int32 width, return detection; } +Detection DetectionWithKeyPoints( + const std::vector>& key_points) { + Detection detection; + LocationData* location_data = detection.mutable_location_data(); + std::for_each(key_points.begin(), key_points.end(), + [location_data](std::pair kp) { + auto* new_kp = location_data->add_relative_keypoints(); + new_kp->set_x(kp.first); + new_kp->set_y(kp.second); + }); + return detection; +} + Detection DetectionWithRelativeLocationData(double xmin, double ymin, double width, double height) { Detection detection; @@ -70,10 +102,61 @@ TEST(DetectionsToRectsCalculatorTest, DetectionToRect) { const std::vector& output = runner.Outputs().Tag("RECT").packets; ASSERT_EQ(1, output.size()); const auto& rect = output[0].Get(); - EXPECT_EQ(rect.width(), 300); - EXPECT_EQ(rect.height(), 400); - EXPECT_EQ(rect.x_center(), 250); - EXPECT_EQ(rect.y_center(), 400); + EXPECT_THAT(rect, RectEq(250, 400, 300, 400)); +} + +::mediapipe::StatusOr RunDetectionKeyPointsToRectCalculation( + Detection detection, std::pair image_size) { + CalculatorRunner runner(ParseTextProtoOrDie(R"( + calculator: "DetectionsToRectsCalculator" + input_stream: "DETECTION:detection" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "RECT:rect" + options: { + [mediapipe.DetectionsToRectsCalculatorOptions.ext] { + conversion_mode: USE_KEYPOINTS + } + } + )")); + + runner.MutableInputs() + ->Tag("DETECTION") + .packets.push_back(MakePacket(std::move(detection)) + .At(Timestamp::PostStream())); + runner.MutableInputs() + ->Tag("IMAGE_SIZE") + .packets.push_back(MakePacket>(image_size) + .At(Timestamp::PostStream())); + + MP_RETURN_IF_ERROR(runner.Run()); + const std::vector& output = runner.Outputs().Tag("RECT").packets; + RET_CHECK_EQ(output.size(), 1); + return output[0].Get(); +} + +TEST(DetectionsToRectsCalculatorTest, DetectionKeyPointsToRect) { + auto status_or_value = RunDetectionKeyPointsToRectCalculation( + /*detection=*/DetectionWithKeyPoints({{0.0f, 0.0f}, {1.0f, 1.0f}}), + /*image_size=*/{640, 480}); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(320, 240, 640, 480)); + + status_or_value = RunDetectionKeyPointsToRectCalculation( + /*detection=*/DetectionWithKeyPoints({{0.25f, 0.25f}, {0.75f, 0.75f}}), + /*image_size=*/{640, 480}); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(320, 240, 320, 240)); + + status_or_value = RunDetectionKeyPointsToRectCalculation( + 
/*detection=*/DetectionWithKeyPoints({{0.0f, 0.0f}, {0.5f, 0.5f}}), + /*image_size=*/{640, 480}); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(160, 120, 320, 240)); + + status_or_value = RunDetectionKeyPointsToRectCalculation( + /*detection=*/DetectionWithKeyPoints({{0.5f, 0.5f}, {1.0f, 1.0f}}), + /*image_size=*/{640, 480}); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(480, 360, 320, 240)); } TEST(DetectionsToRectsCalculatorTest, DetectionToNormalizedRect) { @@ -95,10 +178,56 @@ TEST(DetectionsToRectsCalculatorTest, DetectionToNormalizedRect) { const std::vector& output = runner.Outputs().Tag("NORM_RECT").packets; ASSERT_EQ(1, output.size()); const auto& rect = output[0].Get(); - EXPECT_FLOAT_EQ(rect.width(), 0.3); - EXPECT_FLOAT_EQ(rect.height(), 0.4); - EXPECT_FLOAT_EQ(rect.x_center(), 0.25); - EXPECT_FLOAT_EQ(rect.y_center(), 0.4); + EXPECT_THAT(rect, NormRectEq(0.25f, 0.4f, 0.3f, 0.4f)); +} + +::mediapipe::StatusOr +RunDetectionKeyPointsToNormRectCalculation(Detection detection) { + CalculatorRunner runner(ParseTextProtoOrDie(R"( + calculator: "DetectionsToRectsCalculator" + input_stream: "DETECTION:detection" + output_stream: "NORM_RECT:rect" + options: { + [mediapipe.DetectionsToRectsCalculatorOptions.ext] { + conversion_mode: USE_KEYPOINTS + } + } + )")); + + runner.MutableInputs() + ->Tag("DETECTION") + .packets.push_back(MakePacket(std::move(detection)) + .At(Timestamp::PostStream())); + + MP_RETURN_IF_ERROR(runner.Run()); + const std::vector& output = runner.Outputs().Tag("NORM_RECT").packets; + RET_CHECK_EQ(output.size(), 1); + return output[0].Get(); +} + +TEST(DetectionsToRectsCalculatorTest, DetectionKeyPointsToNormalizedRect) { + NormalizedRect rect; + + auto status_or_value = RunDetectionKeyPointsToNormRectCalculation( + /*detection=*/DetectionWithKeyPoints( + {{0.0f, 0.0f}, {0.5f, 0.5f}, {1.0f, 1.0f}})); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(0.5f, 0.5f, 1.0f, 1.0f)); + + status_or_value = RunDetectionKeyPointsToNormRectCalculation( + /*detection=*/DetectionWithKeyPoints( + {{0.25f, 0.25f}, {0.75f, 0.25f}, {0.75f, 0.75f}})); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(0.5f, 0.5f, 0.5f, 0.5f)); + + status_or_value = RunDetectionKeyPointsToNormRectCalculation( + /*detection=*/DetectionWithKeyPoints({{0.0f, 0.0f}, {0.5f, 0.5f}})); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(0.25f, 0.25f, 0.5f, 0.5f)); + + status_or_value = RunDetectionKeyPointsToNormRectCalculation( + /*detection=*/DetectionWithKeyPoints({{0.5f, 0.5f}, {1.0f, 1.0f}})); + MP_ASSERT_OK(status_or_value); + EXPECT_THAT(status_or_value.ValueOrDie(), RectEq(0.75f, 0.75f, 0.5f, 0.5f)); } TEST(DetectionsToRectsCalculatorTest, DetectionsToRect) { @@ -121,10 +250,7 @@ TEST(DetectionsToRectsCalculatorTest, DetectionsToRect) { const std::vector& output = runner.Outputs().Tag("RECT").packets; ASSERT_EQ(1, output.size()); const auto& rect = output[0].Get(); - EXPECT_EQ(rect.width(), 300); - EXPECT_EQ(rect.height(), 400); - EXPECT_EQ(rect.x_center(), 250); - EXPECT_EQ(rect.y_center(), 400); + EXPECT_THAT(rect, RectEq(250, 400, 300, 400)); } TEST(DetectionsToRectsCalculatorTest, DetectionsToNormalizedRect) { @@ -147,10 +273,7 @@ TEST(DetectionsToRectsCalculatorTest, DetectionsToNormalizedRect) { const std::vector& output = runner.Outputs().Tag("NORM_RECT").packets; ASSERT_EQ(1, output.size()); const auto& rect = output[0].Get(); - 
EXPECT_FLOAT_EQ(rect.width(), 0.3); - EXPECT_FLOAT_EQ(rect.height(), 0.4); - EXPECT_FLOAT_EQ(rect.x_center(), 0.25); - EXPECT_FLOAT_EQ(rect.y_center(), 0.4); + EXPECT_THAT(rect, NormRectEq(0.25f, 0.4f, 0.3f, 0.4f)); } TEST(DetectionsToRectsCalculatorTest, DetectionsToRects) { @@ -173,15 +296,9 @@ TEST(DetectionsToRectsCalculatorTest, DetectionsToRects) { const std::vector& output = runner.Outputs().Tag("RECTS").packets; ASSERT_EQ(1, output.size()); const auto& rects = output[0].Get>(); - EXPECT_EQ(rects.size(), 2); - EXPECT_EQ(rects[0].width(), 300); - EXPECT_EQ(rects[0].height(), 400); - EXPECT_EQ(rects[0].x_center(), 250); - EXPECT_EQ(rects[0].y_center(), 400); - EXPECT_EQ(rects[1].width(), 400); - EXPECT_EQ(rects[1].height(), 500); - EXPECT_EQ(rects[1].x_center(), 400); - EXPECT_EQ(rects[1].y_center(), 550); + ASSERT_EQ(rects.size(), 2); + EXPECT_THAT(rects[0], RectEq(250, 400, 300, 400)); + EXPECT_THAT(rects[1], RectEq(400, 550, 400, 500)); } TEST(DetectionsToRectsCalculatorTest, DetectionsToNormalizedRects) { @@ -205,15 +322,9 @@ TEST(DetectionsToRectsCalculatorTest, DetectionsToNormalizedRects) { runner.Outputs().Tag("NORM_RECTS").packets; ASSERT_EQ(1, output.size()); const auto& rects = output[0].Get>(); - EXPECT_EQ(rects.size(), 2); - EXPECT_FLOAT_EQ(rects[0].width(), 0.3); - EXPECT_FLOAT_EQ(rects[0].height(), 0.4); - EXPECT_FLOAT_EQ(rects[0].x_center(), 0.25); - EXPECT_FLOAT_EQ(rects[0].y_center(), 0.4); - EXPECT_FLOAT_EQ(rects[1].width(), 0.4); - EXPECT_FLOAT_EQ(rects[1].height(), 0.5); - EXPECT_FLOAT_EQ(rects[1].x_center(), 0.4); - EXPECT_FLOAT_EQ(rects[1].y_center(), 0.55); + ASSERT_EQ(rects.size(), 2); + EXPECT_THAT(rects[0], NormRectEq(0.25f, 0.4f, 0.3f, 0.4f)); + EXPECT_THAT(rects[1], NormRectEq(0.4f, 0.55f, 0.4f, 0.5f)); } TEST(DetectionsToRectsCalculatorTest, DetectionToRects) { @@ -236,10 +347,7 @@ TEST(DetectionsToRectsCalculatorTest, DetectionToRects) { ASSERT_EQ(1, output.size()); const auto& rects = output[0].Get>(); EXPECT_EQ(rects.size(), 1); - EXPECT_EQ(rects[0].width(), 300); - EXPECT_EQ(rects[0].height(), 400); - EXPECT_EQ(rects[0].x_center(), 250); - EXPECT_EQ(rects[0].y_center(), 400); + EXPECT_THAT(rects[0], RectEq(250, 400, 300, 400)); } TEST(DetectionsToRectsCalculatorTest, DetectionToNormalizedRects) { @@ -262,11 +370,8 @@ TEST(DetectionsToRectsCalculatorTest, DetectionToNormalizedRects) { runner.Outputs().Tag("NORM_RECTS").packets; ASSERT_EQ(1, output.size()); const auto& rects = output[0].Get>(); - EXPECT_EQ(rects.size(), 1); - EXPECT_FLOAT_EQ(rects[0].width(), 0.3); - EXPECT_FLOAT_EQ(rects[0].height(), 0.4); - EXPECT_FLOAT_EQ(rects[0].x_center(), 0.25); - EXPECT_FLOAT_EQ(rects[0].y_center(), 0.4); + ASSERT_EQ(rects.size(), 1); + EXPECT_THAT(rects[0], NormRectEq(0.25f, 0.4f, 0.3f, 0.4f)); } TEST(DetectionsToRectsCalculatorTest, WrongInputToRect) { @@ -309,4 +414,5 @@ TEST(DetectionsToRectsCalculatorTest, WrongInputToNormalizedRect) { "Only Detection with formats of RELATIVE_BOUNDING_BOX")); } +} // namespace } // namespace mediapipe diff --git a/mediapipe/calculators/util/logic_calculator.cc b/mediapipe/calculators/util/logic_calculator.cc new file mode 100644 index 0000000000..3b6ef81bdc --- /dev/null +++ b/mediapipe/calculators/util/logic_calculator.cc @@ -0,0 +1,105 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+
+#include "mediapipe/calculators/util/logic_calculator.pb.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/port/status.h"
+
+namespace mediapipe {
+using mediapipe::LogicCalculatorOptions;
+
+// A calculator to compute logical functions of bool inputs.
+// With just one input, the output equals the input as expected.
+//
+// Inputs: One or more bool inputs, which may be input-stream-packets,
+// input-side-packets, or options input-values.
+//
+// Outputs: One bool stream.
+//
+// Example config:
+// node {
+//   calculator: "LogicCalculator"
+//   input_stream: "has_data"
+//   input_side_packet: "enable"
+//   input_stream: "is_valid"
+//   output_stream: "process_data"
+//   options {
+//     [mediapipe.LogicCalculatorOptions.ext] {
+//       op: AND
+//       input_value: true
+//     }
+//   }
+// }
+class LogicCalculator : public CalculatorBase {
+ public:
+  static ::mediapipe::Status GetContract(CalculatorContract* cc) {
+    for (int k = 0; k < cc->Inputs().NumEntries(""); ++k) {
+      cc->Inputs().Index(k).Set<bool>();
+    }
+    for (int k = 0; k < cc->InputSidePackets().NumEntries(""); ++k) {
+      cc->InputSidePackets().Index(k).Set<bool>();
+    }
+    RET_CHECK_GE(cc->Inputs().NumEntries("") +
+                     cc->InputSidePackets().NumEntries("") +
+                     cc->Options<LogicCalculatorOptions>().input_value_size(),
+                 1);
+    RET_CHECK_EQ(cc->Outputs().NumEntries(""), 1);
+    cc->Outputs().Index(0).Set<bool>();
+    return ::mediapipe::OkStatus();
+  }
+
+  ::mediapipe::Status Open(CalculatorContext* cc) override {
+    options_ = cc->Options<LogicCalculatorOptions>();
+    cc->SetOffset(TimestampDiff(0));
+    return ::mediapipe::OkStatus();
+  }
+
+  bool LogicalOp(bool b1, bool b2) {
+    switch (options_.op()) {
+      case LogicCalculatorOptions::AND:
+        return b1 && b2;
+      case LogicCalculatorOptions::OR:
+        return b1 || b2;
+      case LogicCalculatorOptions::XOR:
+        return b1 ^ b2;
+    }
+    return false;
+  }
+
+  ::mediapipe::Status Process(CalculatorContext* cc) override {
+    bool result = options_.op() == LogicCalculatorOptions::AND ?
true : false; + for (int k = 0; k < options_.input_value_size(); ++k) { + result = LogicalOp(result, options_.input_value(k)); + } + for (int k = 0; k < cc->Inputs().NumEntries(""); ++k) { + result = LogicalOp(result, cc->Inputs().Index(k).Value().Get()); + } + for (int k = 0; k < cc->InputSidePackets().NumEntries(""); ++k) { + result = LogicalOp(result, cc->InputSidePackets().Index(k).Get()); + } + if (options_.negate()) { + result = !result; + } + cc->Outputs().Index(0).Add(new bool(result), cc->InputTimestamp()); + return ::mediapipe::OkStatus(); + } + + private: + LogicCalculatorOptions options_; +}; +REGISTER_CALCULATOR(LogicCalculator); + +} // namespace mediapipe diff --git a/mediapipe/examples/ios/multihandtrackinggpu/MultiHandTrackingViewController.h b/mediapipe/calculators/util/logic_calculator.proto similarity index 52% rename from mediapipe/examples/ios/multihandtrackinggpu/MultiHandTrackingViewController.h rename to mediapipe/calculators/util/logic_calculator.proto index 17ea6feeb8..fe00a2d9b3 100644 --- a/mediapipe/examples/ios/multihandtrackinggpu/MultiHandTrackingViewController.h +++ b/mediapipe/calculators/util/logic_calculator.proto @@ -1,4 +1,4 @@ -// Copyright 2019 The MediaPipe Authors. +// Copyright 2020 The MediaPipe Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,10 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -#import +syntax = "proto2"; -#import "mediapipe/examples/ios/common/CommonViewController.h" +package mediapipe; -@interface MultiHandTrackingViewController : CommonViewController +import "mediapipe/framework/calculator.proto"; -@end +message LogicCalculatorOptions { + extend CalculatorOptions { + optional LogicCalculatorOptions ext = 338731246; + } + // The logical operation to apply. + enum Operation { + AND = 0; + OR = 1; + XOR = 2; + } + optional Operation op = 1; + + // Whether to negate the result. + optional bool negate = 2; + + // Optional bool input values. 
+ repeated bool input_value = 3; +} diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectioncpu/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectioncpu/BUILD index 7536be08b4..7af950678f 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectioncpu/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectioncpu/BUILD @@ -36,9 +36,8 @@ android_binary( name = "facedetectioncpu", srcs = glob(["*.java"]), assets = [ - "//mediapipe/graphs/face_detection:mobile_cpu.binarypb", - "//mediapipe/models:face_detection_front.tflite", - "//mediapipe/models:face_detection_front_labelmap.txt", + "//mediapipe/graphs/face_detection:face_detection_mobile_cpu.binarypb", + "//mediapipe/modules/face_detection:face_detection_front.tflite", ], assets_dir = "", manifest = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:AndroidManifest.xml", @@ -47,7 +46,7 @@ android_binary( "appName": "Face Detection (CPU)", "mainActivity": "com.google.mediapipe.apps.basic.MainActivity", "cameraFacingFront": "True", - "binaryGraphName": "mobile_cpu.binarypb", + "binaryGraphName": "face_detection_mobile_cpu.binarypb", "inputVideoStreamName": "input_video", "outputVideoStreamName": "output_video", "flipFramesVertically": "True", diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectiongpu/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectiongpu/BUILD index 46a758ab6d..60d4ef44fa 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectiongpu/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectiongpu/BUILD @@ -36,9 +36,8 @@ android_binary( name = "facedetectiongpu", srcs = glob(["*.java"]), assets = [ - "//mediapipe/graphs/face_detection:mobile_gpu.binarypb", - "//mediapipe/models:face_detection_front.tflite", - "//mediapipe/models:face_detection_front_labelmap.txt", + "//mediapipe/graphs/face_detection:face_detection_mobile_gpu.binarypb", + "//mediapipe/modules/face_detection:face_detection_front.tflite", ], assets_dir = "", manifest = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:AndroidManifest.xml", @@ -47,7 +46,7 @@ android_binary( "appName": "Face Detection", "mainActivity": "com.google.mediapipe.apps.basic.MainActivity", "cameraFacingFront": "True", - "binaryGraphName": "mobile_gpu.binarypb", + "binaryGraphName": "face_detection_mobile_gpu.binarypb", "inputVideoStreamName": "input_video", "outputVideoStreamName": "output_video", "flipFramesVertically": "True", diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handdetectiongpu/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handdetectiongpu/BUILD index d7841b6fac..51bc74a33e 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handdetectiongpu/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handdetectiongpu/BUILD @@ -37,8 +37,7 @@ android_binary( srcs = glob(["*.java"]), assets = [ "//mediapipe/graphs/hand_tracking:hand_detection_mobile_gpu.binarypb", - "//mediapipe/models:palm_detection.tflite", - "//mediapipe/models:palm_detection_labelmap.txt", + "//mediapipe/modules/palm_detection:palm_detection.tflite", ], assets_dir = "", manifest = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:AndroidManifest.xml", diff --git 
a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/BUILD index 546ce9aa08..afe1c6777e 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/BUILD @@ -37,10 +37,9 @@ android_binary( srcs = glob(["*.java"]), assets = [ "//mediapipe/graphs/hand_tracking:hand_tracking_mobile_gpu.binarypb", - "//mediapipe/models:handedness.txt", - "//mediapipe/models:hand_landmark.tflite", - "//mediapipe/models:palm_detection.tflite", - "//mediapipe/models:palm_detection_labelmap.txt", + "//mediapipe/modules/hand_landmark:handedness.txt", + "//mediapipe/modules/hand_landmark:hand_landmark.tflite", + "//mediapipe/modules/palm_detection:palm_detection.tflite", ], assets_dir = "", manifest = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:AndroidManifest.xml", diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/MainActivity.java b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/MainActivity.java index e45510c1c7..445431bc4f 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/MainActivity.java +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/MainActivity.java @@ -18,76 +18,75 @@ import android.util.Log; import com.google.mediapipe.formats.proto.LandmarkProto.NormalizedLandmark; import com.google.mediapipe.formats.proto.LandmarkProto.NormalizedLandmarkList; +import com.google.mediapipe.framework.AndroidPacketCreator; +import com.google.mediapipe.framework.Packet; import com.google.mediapipe.framework.PacketGetter; -import com.google.protobuf.InvalidProtocolBufferException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** Main activity of MediaPipe hand tracking app. */ public class MainActivity extends com.google.mediapipe.apps.basic.MainActivity { private static final String TAG = "MainActivity"; - private static final String OUTPUT_HAND_PRESENCE_STREAM_NAME = "hand_presence"; + private static final String INPUT_NUM_HANDS_SIDE_PACKET_NAME = "num_hands"; private static final String OUTPUT_LANDMARKS_STREAM_NAME = "hand_landmarks"; + // Max number of hands to detect/process. 
+ private static final int NUM_HANDS = 2; @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); - processor.addPacketCallback( - OUTPUT_HAND_PRESENCE_STREAM_NAME, - (packet) -> { - Boolean handPresence = PacketGetter.getBool(packet); - if (!handPresence) { - Log.d( - TAG, - "[TS:" + packet.getTimestamp() + "] Hand presence is false, no hands detected."); - } - }); + AndroidPacketCreator packetCreator = processor.getPacketCreator(); + Map inputSidePackets = new HashMap<>(); + inputSidePackets.put(INPUT_NUM_HANDS_SIDE_PACKET_NAME, packetCreator.createInt32(NUM_HANDS)); + processor.setInputSidePackets(inputSidePackets); // To show verbose logging, run: // adb shell setprop log.tag.MainActivity VERBOSE if (Log.isLoggable(TAG, Log.VERBOSE)) { processor.addPacketCallback( - OUTPUT_LANDMARKS_STREAM_NAME, - (packet) -> { - byte[] landmarksRaw = PacketGetter.getProtoBytes(packet); - try { - NormalizedLandmarkList landmarks = NormalizedLandmarkList.parseFrom(landmarksRaw); - if (landmarks == null) { - Log.v(TAG, "[TS:" + packet.getTimestamp() + "] No hand landmarks."); - return; - } - // Note: If hand_presence is false, these landmarks are useless. + OUTPUT_LANDMARKS_STREAM_NAME, + (packet) -> { + Log.v(TAG, "Received multi-hand landmarks packet."); + List multiHandLandmarks = + PacketGetter.getProtoVector(packet, NormalizedLandmarkList.parser()); Log.v( TAG, "[TS:" + packet.getTimestamp() - + "] #Landmarks for hand: " - + landmarks.getLandmarkCount()); - Log.v(TAG, getLandmarksDebugString(landmarks)); - } catch (InvalidProtocolBufferException e) { - Log.e(TAG, "Couldn't Exception received - " + e); - return; - } - }); + + "] " + + getMultiHandLandmarksDebugString(multiHandLandmarks)); + }); } } - private static String getLandmarksDebugString(NormalizedLandmarkList landmarks) { - int landmarkIndex = 0; - String landmarksString = ""; - for (NormalizedLandmark landmark : landmarks.getLandmarkList()) { - landmarksString += - "\t\tLandmark[" - + landmarkIndex - + "]: (" - + landmark.getX() - + ", " - + landmark.getY() - + ", " - + landmark.getZ() - + ")\n"; - ++landmarkIndex; + private String getMultiHandLandmarksDebugString(List multiHandLandmarks) { + if (multiHandLandmarks.isEmpty()) { + return "No hand landmarks"; + } + String multiHandLandmarksStr = "Number of hands detected: " + multiHandLandmarks.size() + "\n"; + int handIndex = 0; + for (NormalizedLandmarkList landmarks : multiHandLandmarks) { + multiHandLandmarksStr += + "\t#Hand landmarks for hand[" + handIndex + "]: " + landmarks.getLandmarkCount() + "\n"; + int landmarkIndex = 0; + for (NormalizedLandmark landmark : landmarks.getLandmarkList()) { + multiHandLandmarksStr += + "\t\tLandmark [" + + landmarkIndex + + "]: (" + + landmark.getX() + + ", " + + landmark.getY() + + ", " + + landmark.getZ() + + ")\n"; + ++landmarkIndex; + } + ++handIndex; } - return landmarksString; + return multiHandLandmarksStr; } } diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu/BUILD deleted file mode 100644 index 7d4d7418c7..0000000000 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu/BUILD +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2019 The MediaPipe Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -licenses(["notice"]) - -package(default_visibility = ["//visibility:private"]) - -cc_binary( - name = "libmediapipe_jni.so", - linkshared = 1, - linkstatic = 1, - deps = [ - "//mediapipe/graphs/hand_tracking:multi_hand_mobile_calculators", - "//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni", - ], -) - -cc_library( - name = "mediapipe_jni_lib", - srcs = [":libmediapipe_jni.so"], - alwayslink = 1, -) - -android_binary( - name = "multihandtrackinggpu", - srcs = glob(["*.java"]), - assets = [ - "//mediapipe/graphs/hand_tracking:multi_hand_tracking_mobile_gpu.binarypb", - "//mediapipe/models:handedness.txt", - "//mediapipe/models:hand_landmark.tflite", - "//mediapipe/models:palm_detection.tflite", - "//mediapipe/models:palm_detection_labelmap.txt", - ], - assets_dir = "", - manifest = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:AndroidManifest.xml", - manifest_values = { - "applicationId": "com.google.mediapipe.apps.multihandtrackinggpu", - "appName": "Multi-hand Tracking", - "mainActivity": ".MainActivity", - "cameraFacingFront": "True", - "binaryGraphName": "multi_hand_tracking_mobile_gpu.binarypb", - "inputVideoStreamName": "input_video", - "outputVideoStreamName": "output_video", - "flipFramesVertically": "True", - }, - multidex = "native", - deps = [ - ":mediapipe_jni_lib", - "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:basic_lib", - "//mediapipe/framework/formats:landmark_java_proto_lite", - "//mediapipe/java/com/google/mediapipe/framework:android_framework", - ], -) diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu/MainActivity.java b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu/MainActivity.java deleted file mode 100644 index 0d4dfde7f7..0000000000 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/multihandtrackinggpu/MainActivity.java +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2019 The MediaPipe Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package com.google.mediapipe.apps.multihandtrackinggpu; - -import android.os.Bundle; -import android.util.Log; -import com.google.mediapipe.formats.proto.LandmarkProto.NormalizedLandmark; -import com.google.mediapipe.formats.proto.LandmarkProto.NormalizedLandmarkList; -import com.google.mediapipe.framework.PacketGetter; -import java.util.List; - -/** Main activity of MediaPipe multi-hand tracking app. 
*/ -public class MainActivity extends com.google.mediapipe.apps.basic.MainActivity { - private static final String TAG = "MainActivity"; - - private static final String OUTPUT_LANDMARKS_STREAM_NAME = "multi_hand_landmarks"; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - - // To show verbose logging, run: - // adb shell setprop log.tag.MainActivity VERBOSE - if (Log.isLoggable(TAG, Log.VERBOSE)) { - processor.addPacketCallback( - OUTPUT_LANDMARKS_STREAM_NAME, - (packet) -> { - Log.v(TAG, "Received multi-hand landmarks packet."); - List multiHandLandmarks = - PacketGetter.getProtoVector(packet, NormalizedLandmarkList.parser()); - Log.v( - TAG, - "[TS:" - + packet.getTimestamp() - + "] " - + getMultiHandLandmarksDebugString(multiHandLandmarks)); - }); - } - } - - private String getMultiHandLandmarksDebugString(List multiHandLandmarks) { - if (multiHandLandmarks.isEmpty()) { - return "No hand landmarks"; - } - String multiHandLandmarksStr = "Number of hands detected: " + multiHandLandmarks.size() + "\n"; - int handIndex = 0; - for (NormalizedLandmarkList landmarks : multiHandLandmarks) { - multiHandLandmarksStr += - "\t#Hand landmarks for hand[" + handIndex + "]: " + landmarks.getLandmarkCount() + "\n"; - int landmarkIndex = 0; - for (NormalizedLandmark landmark : landmarks.getLandmarkList()) { - multiHandLandmarksStr += - "\t\tLandmark [" - + landmarkIndex - + "]: (" - + landmark.getX() - + ", " - + landmark.getY() - + ", " - + landmark.getZ() - + ")\n"; - ++landmarkIndex; - } - ++handIndex; - } - return multiHandLandmarksStr; - } -} diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD index f07bc8ebca..a8114b3f82 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD @@ -1,4 +1,4 @@ -# Copyright 2019 The MediaPipe Authors. +# Copyright 2020 The MediaPipe Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,16 +12,64 @@ # See the License for the specific language governing permissions and # limitations under the License. 
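Note on the hand-tracking changes above: the dedicated multi-hand example app is removed because the regular hand-tracking graph now takes the maximum number of hands as a "num_hands" input side packet and emits one landmark list per detected hand. As a rough, illustrative sketch only (not part of this patch), a desktop C++ caller could supply the same side packet when starting the graph; the function name and the way `config` is obtained are assumptions, while the "num_hands" packet name and int type come from the hunks above:

#include <map>
#include <string>

#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/status.h"

// Sketch: start the consolidated hand-tracking graph with the "num_hands"
// input side packet introduced by this change. `config` is a
// CalculatorGraphConfig loaded elsewhere by the caller.
::mediapipe::Status RunWithNumHands(const mediapipe::CalculatorGraphConfig& config) {
  mediapipe::CalculatorGraph graph;
  MP_RETURN_IF_ERROR(graph.Initialize(config));
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets["num_hands"] = mediapipe::MakePacket<int>(2);  // Track up to two hands.
  MP_RETURN_IF_ERROR(graph.StartRun(side_packets));
  // ... add "input_video" packets here, then close the input stream ...
  return graph.WaitUntilDone();
}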
+load("@bazel_skylib//lib:selects.bzl", "selects") +load(":build_defs.bzl", "generate_manifest_values") + licenses(["notice"]) package(default_visibility = ["//visibility:private"]) +config_setting( + name = "use_chair", + define_values = { + "chair": "true", + }, +) + +config_setting( + name = "use_cup", + define_values = { + "cup": "true", + }, +) + +config_setting( + name = "use_camera", + define_values = { + "camera": "true", + }, +) + +config_setting( + name = "use_shoe_1stage", + define_values = { + "shoe_1stage": "true", + }, +) + +config_setting( + name = "use_chair_1stage", + define_values = { + "chair_1stage": "true", + }, +) + +selects.config_setting_group( + name = "1stage", + match_any = [ + ":use_shoe_1stage", + ":use_chair_1stage", + ], +) + cc_binary( name = "libmediapipe_jni.so", linkshared = 1, linkstatic = 1, - deps = [ - "//mediapipe/graphs/object_detection_3d:mobile_calculators", + deps = select({ + "//conditions:default": ["//mediapipe/graphs/object_detection_3d:mobile_calculators"], + ":1stage": ["//mediapipe/graphs/object_detection_3d:mobile_calculators_1stage"], + }) + [ "//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni", ], ) @@ -32,67 +80,108 @@ cc_library( alwayslink = 1, ) -# To use the "chair" model instead of the default "shoes" model, -# add "--define chair=true" to the bazel build command. -config_setting( - name = "use_chair_model", - define_values = { - "chair": "true", - }, -) - genrule( name = "binary_graph", srcs = select({ - "//conditions:default": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_shoe"], - ":use_chair_model": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_chair"], + "//conditions:default": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph"], + ":1stage": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_1stage_binary_graph"], }), outs = ["object_detection_3d.binarypb"], cmd = "cp $< $@", ) +MODELS_DIR = "//mediapipe/models" + genrule( name = "model", srcs = select({ - "//conditions:default": ["//mediapipe/models:object_detection_3d_sneakers.tflite"], - ":use_chair_model": ["//mediapipe/models:object_detection_3d_chair.tflite"], + "//conditions:default": [MODELS_DIR + ":object_detection_3d_sneakers.tflite"], + ":use_chair": [MODELS_DIR + ":object_detection_3d_chair.tflite"], + ":use_cup": [MODELS_DIR + ":object_detection_3d_cup.tflite"], + ":use_camera": [MODELS_DIR + ":object_detection_3d_camera.tflite"], + ":use_shoe_1stage": [MODELS_DIR + ":object_detection_3d_sneakers_1stage.tflite"], + ":use_chair_1stage": [MODELS_DIR + ":object_detection_3d_chair_1stage.tflite"], }), outs = ["object_detection_3d.tflite"], cmd = "cp $< $@", ) +MANIFESTS_DIR = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests" + +android_library( + name = "manifest_lib", + exports_manifest = 1, + manifest = select({ + "//conditions:default": MANIFESTS_DIR + ":AndroidManifestSneaker.xml", + ":use_chair": MANIFESTS_DIR + ":AndroidManifestChair.xml", + ":use_cup": MANIFESTS_DIR + ":AndroidManifestCup.xml", + ":use_camera": MANIFESTS_DIR + ":AndroidManifestCamera.xml", + ":use_shoe_1stage": MANIFESTS_DIR + ":AndroidManifestSneaker.xml", + ":use_chair_1stage": MANIFESTS_DIR + ":AndroidManifestChair.xml", + }), + deps = [ + "//third_party:opencv", + "@maven//:androidx_concurrent_concurrent_futures", + "@maven//:com_google_guava_guava", + ], +) + +ASSETS_DIR = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets" + 
+genrule( + name = "mesh", + srcs = select({ + "//conditions:default": [ASSETS_DIR + "/sneaker:model.obj.uuu"], + ":use_chair": [ASSETS_DIR + "/chair:model.obj.uuu"], + ":use_cup": [ASSETS_DIR + "/cup:model.obj.uuu"], + ":use_camera": [ASSETS_DIR + "/camera:model.obj.uuu"], + ":use_shoe_1stage": [ASSETS_DIR + "/sneaker:model.obj.uuu"], + ":use_chair_1stage": [ASSETS_DIR + "/chair:model.obj.uuu"], + }), + outs = ["model.obj.uuu"], + cmd = "cp $< $@", +) + +genrule( + name = "texture", + srcs = select({ + "//conditions:default": [ASSETS_DIR + "/sneaker:texture.jpg"], + ":use_chair": [ASSETS_DIR + "/chair:texture.jpg"], + ":use_cup": [ASSETS_DIR + "/cup:texture.jpg"], + ":use_camera": [ASSETS_DIR + "/camera:texture.jpg"], + ":use_shoe_1stage": [ASSETS_DIR + "/sneaker:texture.jpg"], + ":use_chair_1stage": [ASSETS_DIR + "/chair:texture.jpg"], + }), + outs = ["texture.jpg"], + cmd = "cp $< $@", +) + android_binary( name = "objectdetection3d", srcs = glob(["*.java"]), assets = [ ":binary_graph", ":model", - "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:box.obj.uuu", - "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:classic_colors.png", - ] + select({ - "//conditions:default": [ - "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:model.obj.uuu", - "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:texture.jpg", - ], - ":use_chair_model": [ - "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:model.obj.uuu", - "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:texture.jpg", - ], - }), + ":mesh", + ":texture", + MODELS_DIR + ":object_detection_ssd_mobilenetv2_oidv4_fp16.tflite", + MODELS_DIR + ":object_detection_oidv4_labelmap.pbtxt", + ASSETS_DIR + ":box.obj.uuu", + ASSETS_DIR + ":classic_colors.png", + ], assets_dir = "", manifest = "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:AndroidManifest.xml", - manifest_values = { - "applicationId": "com.google.mediapipe.apps.objectdetection3d", - "appName": "Objectron", - "mainActivity": ".MainActivity", - "cameraFacingFront": "False", - "binaryGraphName": "object_detection_3d.binarypb", - "inputVideoStreamName": "input_video", - "outputVideoStreamName": "output_video", - "flipFramesVertically": "True", - }, + manifest_values = select({ + "//conditions:default": generate_manifest_values("com.google.mediapipe.apps.objectdetection3d_shoe", "Shoe Objectron"), + ":use_chair": generate_manifest_values("com.google.mediapipe.apps.objectdetection3d_chair", "Chair Objectron"), + ":use_cup": generate_manifest_values("com.google.mediapipe.apps.objectdetection3d_cup", "Cup Objectron"), + ":use_camera": generate_manifest_values("com.google.mediapipe.apps.objectdetection3d_camera", "Camera Objectron"), + ":use_shoe_1stage": generate_manifest_values("com.google.mediapipe.apps.objectdetection3d_shoe_1stage", "Single Stage Shoe Objectron"), + ":use_chair_1stage": generate_manifest_values("com.google.mediapipe.apps.objectdetection3d_chair_1stage", "Single Stage Chair Objectron"), + }), multidex = "native", deps = [ + ":manifest_lib", ":mediapipe_jni_lib", "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:basic_lib", "//mediapipe/framework/formats:landmark_java_proto_lite", diff --git 
a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/MainActivity.java b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/MainActivity.java index 92f9f55bb1..cda1819f54 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/MainActivity.java +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/MainActivity.java @@ -1,4 +1,4 @@ -// Copyright 2019 The MediaPipe Authors. +// Copyright 2020 The MediaPipe Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,6 +14,9 @@ package com.google.mediapipe.apps.objectdetection3d; +import android.content.pm.ApplicationInfo; +import android.content.pm.PackageManager; +import android.content.pm.PackageManager.NameNotFoundException; import android.graphics.Bitmap; import android.graphics.BitmapFactory; import android.os.Bundle; @@ -40,10 +43,25 @@ public class MainActivity extends com.google.mediapipe.apps.basic.MainActivity { private Bitmap objTexture = null; private Bitmap boxTexture = null; + // ApplicationInfo for retrieving metadata defined in the manifest. + private ApplicationInfo applicationInfo; + @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); + try { + applicationInfo = + getPackageManager().getApplicationInfo(getPackageName(), PackageManager.GET_META_DATA); + } catch (NameNotFoundException e) { + Log.e(TAG, "Cannot find application info: " + e); + } + + String categoryName = applicationInfo.metaData.getString("categoryName"); + float[] modelScale = parseFloatArrayFromString( + applicationInfo.metaData.getString("modelScale")); + float[] modelTransform = parseFloatArrayFromString( + applicationInfo.metaData.getString("modelTransformation")); prepareDemoAssets(); AndroidPacketCreator packetCreator = processor.getPacketCreator(); Map inputSidePackets = new HashMap<>(); @@ -51,6 +69,9 @@ protected void onCreate(Bundle savedInstanceState) { inputSidePackets.put("box_asset_name", packetCreator.createString(BOX_FILE)); inputSidePackets.put("obj_texture", packetCreator.createRgbaImageFrame(objTexture)); inputSidePackets.put("box_texture", packetCreator.createRgbaImageFrame(boxTexture)); + inputSidePackets.put("allowed_labels", packetCreator.createString(categoryName)); + inputSidePackets.put("model_scale", packetCreator.createFloat32Array(modelScale)); + inputSidePackets.put("model_transformation", packetCreator.createFloat32Array(modelTransform)); processor.setInputSidePackets(inputSidePackets); } @@ -134,4 +155,13 @@ private void prepareDemoAssets() { throw new RuntimeException(e); } } + + private static float[] parseFloatArrayFromString(String string) { + String[] elements = string.split(",", -1); + float[] array = new float[elements.length]; + for (int i = 0; i < elements.length; ++i) { + array[i] = Float.parseFloat(elements[i]); + } + return array; + } } diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/BUILD index 46d1640403..a8bb9124c3 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/BUILD @@ -1,4 +1,4 @@ -# Copyright 2019 The MediaPipe Authors. 
+# Copyright 2020 The MediaPipe Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/mediapipe/examples/python/__init__.py b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/BUILD similarity index 83% rename from mediapipe/examples/python/__init__.py rename to mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/BUILD index 5d9133833d..a8bb9124c3 100644 --- a/mediapipe/examples/python/__init__.py +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/BUILD @@ -11,6 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""MediaPipe Python Examples.""" -from mediapipe.examples.python.upper_body_pose_tracker import UpperBodyPoseTracker +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +exports_files( + srcs = glob(["**"]), +) diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/model.obj.uuu b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/model.obj.uuu new file mode 100644 index 0000000000..0280d5dd09 Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/model.obj.uuu differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/texture.jpg b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/texture.jpg new file mode 100644 index 0000000000..4a19534ddd Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/camera/texture.jpg differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/BUILD index 46d1640403..a8bb9124c3 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/BUILD @@ -1,4 +1,4 @@ -# Copyright 2019 The MediaPipe Authors. +# Copyright 2020 The MediaPipe Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/BUILD new file mode 100644 index 0000000000..a8bb9124c3 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/BUILD @@ -0,0 +1,21 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +exports_files( + srcs = glob(["**"]), +) diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/model.obj.uuu b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/model.obj.uuu new file mode 100644 index 0000000000..167e134eb7 Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/model.obj.uuu differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/texture.jpg b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/texture.jpg new file mode 100644 index 0000000000..f3aea35682 Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/cup/texture.jpg differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/BUILD index 46d1640403..a8bb9124c3 100644 --- a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/BUILD +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/BUILD @@ -1,4 +1,4 @@ -# Copyright 2019 The MediaPipe Authors. +# Copyright 2020 The MediaPipe Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/build_defs.bzl b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/build_defs.bzl new file mode 100644 index 0000000000..85a2a76aed --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/build_defs.bzl @@ -0,0 +1,14 @@ +"""Build defs for Objectron.""" + +def generate_manifest_values(application_id, app_name): + manifest_values = { + "applicationId": application_id, + "appName": app_name, + "mainActivity": "com.google.mediapipe.apps.objectdetection3d.MainActivity", + "cameraFacingFront": "False", + "binaryGraphName": "object_detection_3d.binarypb", + "inputVideoStreamName": "input_video", + "outputVideoStreamName": "output_video", + "flipFramesVertically": "True", + } + return manifest_values diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestCamera.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestCamera.xml new file mode 100644 index 0000000000..4c4a5b930d --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestCamera.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestChair.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestChair.xml new file mode 100644 index 0000000000..71dfe74089 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestChair.xml @@ -0,0 +1,17 @@ + + + + + 
+ + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestCup.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestCup.xml new file mode 100644 index 0000000000..36a3973de4 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestCup.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestSneaker.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestSneaker.xml new file mode 100644 index 0000000000..5e695e9d72 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/AndroidManifestSneaker.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/BUILD new file mode 100644 index 0000000000..a8bb9124c3 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/manifests/BUILD @@ -0,0 +1,21 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +exports_files( + srcs = glob(["**"]), +) diff --git a/mediapipe/examples/coral/BUILD b/mediapipe/examples/coral/BUILD index 03d4027e7f..ec747573bd 100644 --- a/mediapipe/examples/coral/BUILD +++ b/mediapipe/examples/coral/BUILD @@ -51,6 +51,6 @@ cc_binary( name = "face_detection_tpu", deps = [ "//mediapipe/examples/coral:demo_run_graph_main", - "//mediapipe/graphs/face_detection:desktop_tflite_calculators", + "//mediapipe/graphs/face_detection:desktop_live_calculators", ], ) diff --git a/mediapipe/examples/desktop/autoflip/subgraph/BUILD b/mediapipe/examples/desktop/autoflip/subgraph/BUILD index 9af7e447b8..6c3e2616cf 100644 --- a/mediapipe/examples/desktop/autoflip/subgraph/BUILD +++ b/mediapipe/examples/desktop/autoflip/subgraph/BUILD @@ -18,14 +18,23 @@ licenses(["notice"]) package(default_visibility = ["//mediapipe/examples:__subpackages__"]) +FACE_DETECTION_DEPS = [ + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tflite:ssd_anchors_calculator", + "//mediapipe/calculators/tflite:tflite_converter_calculator", + "//mediapipe/calculators/tflite:tflite_inference_calculator", + "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", + "//mediapipe/calculators/util:detection_label_id_to_text_calculator", + "//mediapipe/calculators/util:detection_letterbox_removal_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", +] + mediapipe_simple_subgraph( name = "autoflip_face_detection_subgraph", graph = "face_detection_subgraph.pbtxt", register_as = "AutoFlipFaceDetectionSubgraph", visibility = ["//visibility:public"], - deps = [ - "//mediapipe/graphs/face_detection:desktop_tflite_calculators", - ], + deps = FACE_DETECTION_DEPS, ) mediapipe_simple_subgraph( @@ -33,16 +42,7 @@ mediapipe_simple_subgraph( graph = "front_face_detection_subgraph.pbtxt", register_as = "AutoFlipFrontFaceDetectionSubgraph", visibility = ["//visibility:public"], - deps = [ - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", - "//mediapipe/calculators/util:detection_label_id_to_text_calculator", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", - "//mediapipe/calculators/util:non_max_suppression_calculator", - ], + deps = FACE_DETECTION_DEPS, ) mediapipe_simple_subgraph( diff --git a/mediapipe/examples/desktop/face_detection/BUILD b/mediapipe/examples/desktop/face_detection/BUILD index 55c9eb7414..5743ae788d 100644 --- a/mediapipe/examples/desktop/face_detection/BUILD +++ b/mediapipe/examples/desktop/face_detection/BUILD @@ -20,7 +20,7 @@ cc_binary( name = "face_detection_cpu", deps = [ "//mediapipe/examples/desktop:demo_run_graph_main", - "//mediapipe/graphs/face_detection:desktop_tflite_calculators", + "//mediapipe/graphs/face_detection:desktop_live_calculators", ], ) @@ -29,6 +29,6 @@ cc_binary( name = "face_detection_gpu", deps = [ "//mediapipe/examples/desktop:demo_run_graph_main_gpu", - "//mediapipe/graphs/face_detection:mobile_calculators", + "//mediapipe/graphs/face_detection:desktop_live_gpu_calculators", ], ) diff --git a/mediapipe/examples/desktop/multi_hand_tracking/BUILD b/mediapipe/examples/desktop/multi_hand_tracking/BUILD deleted file mode 
100644 index a7bd112ffb..0000000000 --- a/mediapipe/examples/desktop/multi_hand_tracking/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2019 The MediaPipe Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -licenses(["notice"]) - -package(default_visibility = ["//mediapipe/examples:__subpackages__"]) - -cc_binary( - name = "multi_hand_tracking_tflite", - deps = [ - "//mediapipe/examples/desktop:simple_run_graph_main", - "//mediapipe/graphs/hand_tracking:multi_hand_desktop_tflite_calculators", - ], -) - -cc_binary( - name = "multi_hand_tracking_cpu", - deps = [ - "//mediapipe/examples/desktop:demo_run_graph_main", - "//mediapipe/graphs/hand_tracking:multi_hand_desktop_tflite_calculators", - ], -) - -# Linux only -cc_binary( - name = "multi_hand_tracking_gpu", - deps = [ - "//mediapipe/examples/desktop:demo_run_graph_main_gpu", - "//mediapipe/graphs/hand_tracking:multi_hand_mobile_calculators", - ], -) diff --git a/mediapipe/examples/ios/facedetectioncpu/BUILD b/mediapipe/examples/ios/facedetectioncpu/BUILD index a4ae2cfcad..43bff9b1eb 100644 --- a/mediapipe/examples/ios/facedetectioncpu/BUILD +++ b/mediapipe/examples/ios/facedetectioncpu/BUILD @@ -54,9 +54,8 @@ ios_application( objc_library( name = "FaceDetectionCpuAppLibrary", data = [ - "//mediapipe/graphs/face_detection:mobile_cpu_binary_graph", - "//mediapipe/models:face_detection_front.tflite", - "//mediapipe/models:face_detection_front_labelmap.txt", + "//mediapipe/graphs/face_detection:face_detection_mobile_cpu.binarypb", + "//mediapipe/modules/face_detection:face_detection_front.tflite", ], deps = [ "//mediapipe/examples/ios/common:CommonMediaPipeAppLibrary", diff --git a/mediapipe/examples/ios/facedetectioncpu/Info.plist b/mediapipe/examples/ios/facedetectioncpu/Info.plist index d1738a5c77..34e1a7eee2 100644 --- a/mediapipe/examples/ios/facedetectioncpu/Info.plist +++ b/mediapipe/examples/ios/facedetectioncpu/Info.plist @@ -9,6 +9,6 @@ GraphInputStream input_video GraphName - mobile_cpu + face_detection_mobile_cpu diff --git a/mediapipe/examples/ios/facedetectiongpu/BUILD b/mediapipe/examples/ios/facedetectiongpu/BUILD index 507ac45d87..51856a7f7a 100644 --- a/mediapipe/examples/ios/facedetectiongpu/BUILD +++ b/mediapipe/examples/ios/facedetectiongpu/BUILD @@ -54,9 +54,8 @@ ios_application( objc_library( name = "FaceDetectionGpuAppLibrary", data = [ - "//mediapipe/graphs/face_detection:mobile_gpu_binary_graph", - "//mediapipe/models:face_detection_front.tflite", - "//mediapipe/models:face_detection_front_labelmap.txt", + "//mediapipe/graphs/face_detection:face_detection_mobile_gpu.binarypb", + "//mediapipe/modules/face_detection:face_detection_front.tflite", ], deps = [ "//mediapipe/examples/ios/common:CommonMediaPipeAppLibrary", diff --git a/mediapipe/examples/ios/facedetectiongpu/Info.plist b/mediapipe/examples/ios/facedetectiongpu/Info.plist index 6b47907348..45feefb452 100644 --- a/mediapipe/examples/ios/facedetectiongpu/Info.plist +++ b/mediapipe/examples/ios/facedetectiongpu/Info.plist @@ -9,6 +9,6 @@ GraphInputStream 
input_video GraphName - mobile_gpu + face_detection_mobile_gpu diff --git a/mediapipe/examples/ios/faceeffect/BUILD b/mediapipe/examples/ios/faceeffect/BUILD index 0437c0dc92..271dcfa885 100644 --- a/mediapipe/examples/ios/faceeffect/BUILD +++ b/mediapipe/examples/ios/faceeffect/BUILD @@ -34,7 +34,7 @@ alias( ios_application( name = "FaceEffectApp", app_icons = ["//mediapipe/examples/ios/common:AppIcon"], - bundle_id = BUNDLE_ID_PREFIX + ".FaceMeshGpu", + bundle_id = BUNDLE_ID_PREFIX + ".FaceEffectGpu", families = [ "iphone", "ipad", diff --git a/mediapipe/examples/ios/facemeshgpu/BUILD b/mediapipe/examples/ios/facemeshgpu/BUILD index 11bd649bfe..5a7f92e1e2 100644 --- a/mediapipe/examples/ios/facemeshgpu/BUILD +++ b/mediapipe/examples/ios/facemeshgpu/BUILD @@ -60,7 +60,7 @@ objc_library( "FaceMeshGpuViewController.h", ], data = [ - "//mediapipe/graphs/face_mesh:face_mesh_mobile_gpu_binary_graph", + "//mediapipe/graphs/face_mesh:face_mesh_mobile_gpu.binarypb", "//mediapipe/modules/face_detection:face_detection_front.tflite", "//mediapipe/modules/face_landmark:face_landmark.tflite", ], diff --git a/mediapipe/examples/ios/handdetectiongpu/BUILD b/mediapipe/examples/ios/handdetectiongpu/BUILD index e1fbb8bd6f..1b5ed9820e 100644 --- a/mediapipe/examples/ios/handdetectiongpu/BUILD +++ b/mediapipe/examples/ios/handdetectiongpu/BUILD @@ -55,8 +55,7 @@ objc_library( name = "HandDetectionGpuAppLibrary", data = [ "//mediapipe/graphs/hand_tracking:hand_detection_mobile_gpu_binary_graph", - "//mediapipe/models:palm_detection.tflite", - "//mediapipe/models:palm_detection_labelmap.txt", + "//mediapipe/modules/palm_detection:palm_detection.tflite", ], deps = [ "//mediapipe/examples/ios/common:CommonMediaPipeAppLibrary", diff --git a/mediapipe/examples/ios/handtrackinggpu/BUILD b/mediapipe/examples/ios/handtrackinggpu/BUILD index b3ac999b6b..ed732f8fbf 100644 --- a/mediapipe/examples/ios/handtrackinggpu/BUILD +++ b/mediapipe/examples/ios/handtrackinggpu/BUILD @@ -60,11 +60,10 @@ objc_library( "HandTrackingViewController.h", ], data = [ - "//mediapipe/graphs/hand_tracking:hand_tracking_mobile_gpu_binary_graph", - "//mediapipe/models:hand_landmark.tflite", - "//mediapipe/models:handedness.txt", - "//mediapipe/models:palm_detection.tflite", - "//mediapipe/models:palm_detection_labelmap.txt", + "//mediapipe/graphs/hand_tracking:hand_tracking_mobile_gpu.binarypb", + "//mediapipe/modules/hand_landmark:hand_landmark.tflite", + "//mediapipe/modules/hand_landmark:handedness.txt", + "//mediapipe/modules/palm_detection:palm_detection.tflite", ], deps = [ "//mediapipe/examples/ios/common:CommonMediaPipeAppLibrary", diff --git a/mediapipe/examples/ios/handtrackinggpu/HandTrackingViewController.mm b/mediapipe/examples/ios/handtrackinggpu/HandTrackingViewController.mm index 491d654597..87e562d01f 100644 --- a/mediapipe/examples/ios/handtrackinggpu/HandTrackingViewController.mm +++ b/mediapipe/examples/ios/handtrackinggpu/HandTrackingViewController.mm @@ -17,6 +17,10 @@ #include "mediapipe/framework/formats/landmark.pb.h" static const char* kLandmarksOutputStream = "hand_landmarks"; +static const char* kNumHandsInputSidePacket = "num_hands"; + +// Max number of hands to detect/process. 
+static const int kNumHands = 2;
 
 @implementation HandTrackingViewController
 
@@ -25,6 +29,8 @@ @implementation HandTrackingViewController
 - (void)viewDidLoad {
   [super viewDidLoad];
 
+  [self.mediapipeGraph setSidePacket:(mediapipe::MakePacket<int>(kNumHands))
+                               named:kNumHandsInputSidePacket];
   [self.mediapipeGraph addFrameOutputStream:kLandmarksOutputStream
                            outputPacketType:MPPPacketTypeRaw];
 }
@@ -40,12 +46,16 @@ - (void)mediapipeGraph:(MPPGraph*)graph
       NSLog(@"[TS:%lld] No hand landmarks", packet.Timestamp().Value());
       return;
     }
-    const auto& landmarks = packet.Get<::mediapipe::NormalizedLandmarkList>();
-    NSLog(@"[TS:%lld] Number of landmarks on hand: %d", packet.Timestamp().Value(),
-          landmarks.landmark_size());
-    for (int i = 0; i < landmarks.landmark_size(); ++i) {
-      NSLog(@"\tLandmark[%d]: (%f, %f, %f)", i, landmarks.landmark(i).x(),
-            landmarks.landmark(i).y(), landmarks.landmark(i).z());
+    const auto& multiHandLandmarks = packet.Get<std::vector<::mediapipe::NormalizedLandmarkList>>();
+    NSLog(@"[TS:%lld] Number of hand instances with landmarks: %lu", packet.Timestamp().Value(),
+          multiHandLandmarks.size());
+    for (int handIndex = 0; handIndex < multiHandLandmarks.size(); ++handIndex) {
+      const auto& landmarks = multiHandLandmarks[handIndex];
+      NSLog(@"\tNumber of landmarks for hand[%d]: %d", handIndex, landmarks.landmark_size());
+      for (int i = 0; i < landmarks.landmark_size(); ++i) {
+        NSLog(@"\t\tLandmark[%d]: (%f, %f, %f)", i, landmarks.landmark(i).x(),
+              landmarks.landmark(i).y(), landmarks.landmark(i).z());
+      }
     }
   }
 }
diff --git a/mediapipe/examples/ios/multihandtrackinggpu/BUILD b/mediapipe/examples/ios/multihandtrackinggpu/BUILD
deleted file mode 100644
index 5616f12b6b..0000000000
--- a/mediapipe/examples/ios/multihandtrackinggpu/BUILD
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright 2019 The MediaPipe Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
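For completeness, an illustrative sketch (not part of this patch) of reading the same data from C++: as the iOS callback above shows, the "hand_landmarks" stream now carries a std::vector<::mediapipe::NormalizedLandmarkList>, one entry per detected hand. The poller-based setup and the function name below are assumptions; only the stream name, the packet type, and the "num_hands" side packet come from the hunks above:

#include <map>
#include <string>
#include <vector>

#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/landmark.pb.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/status.h"

// Sketch: poll per-hand landmark lists from the "hand_landmarks" output
// stream. `graph` is assumed to be already initialized with the
// hand-tracking config but not yet started.
::mediapipe::Status PollHandLandmarks(mediapipe::CalculatorGraph* graph) {
  // The poller must be attached before StartRun.
  ASSIGN_OR_RETURN(mediapipe::OutputStreamPoller poller,
                   graph->AddOutputStreamPoller("hand_landmarks"));
  MP_RETURN_IF_ERROR(graph->StartRun(
      {{"num_hands", mediapipe::MakePacket<int>(2)}}));
  mediapipe::Packet packet;
  while (poller.Next(&packet)) {
    const auto& hands =
        packet.Get<std::vector<::mediapipe::NormalizedLandmarkList>>();
    for (const auto& landmarks : hands) {
      LOG(INFO) << "Hand with " << landmarks.landmark_size() << " landmarks";
    }
  }
  // The caller is expected to feed and then close "input_video" so that
  // WaitUntilDone() can return.
  return graph->WaitUntilDone();
}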
- -load( - "@build_bazel_rules_apple//apple:ios.bzl", - "ios_application", -) -load( - "//mediapipe/examples/ios:bundle_id.bzl", - "BUNDLE_ID_PREFIX", - "example_provisioning", -) - -licenses(["notice"]) - -MIN_IOS_VERSION = "10.0" - -alias( - name = "multihandtrackinggpu", - actual = "MultiHandTrackingGpuApp", -) - -ios_application( - name = "MultiHandTrackingGpuApp", - app_icons = ["//mediapipe/examples/ios/common:AppIcon"], - bundle_id = BUNDLE_ID_PREFIX + ".MultiHandTrackingGpu", - families = [ - "iphone", - "ipad", - ], - infoplists = [ - "//mediapipe/examples/ios/common:Info.plist", - "Info.plist", - ], - minimum_os_version = MIN_IOS_VERSION, - provisioning_profile = example_provisioning(), - deps = [ - ":MultiHandTrackingGpuAppLibrary", - "@ios_opencv//:OpencvFramework", - ], -) - -objc_library( - name = "MultiHandTrackingGpuAppLibrary", - srcs = [ - "MultiHandTrackingViewController.mm", - ], - hdrs = [ - "MultiHandTrackingViewController.h", - ], - data = [ - "//mediapipe/graphs/hand_tracking:multi_hand_tracking_mobile_gpu_binary_graph", - "//mediapipe/models:hand_landmark.tflite", - "//mediapipe/models:handedness.txt", - "//mediapipe/models:palm_detection.tflite", - "//mediapipe/models:palm_detection_labelmap.txt", - ], - deps = [ - "//mediapipe/examples/ios/common:CommonMediaPipeAppLibrary", - ] + select({ - "//mediapipe:ios_i386": [], - "//mediapipe:ios_x86_64": [], - "//conditions:default": [ - "//mediapipe/graphs/hand_tracking:multi_hand_mobile_calculators", - "//mediapipe/framework/formats:landmark_cc_proto", - ], - }), -) diff --git a/mediapipe/examples/ios/multihandtrackinggpu/Info.plist b/mediapipe/examples/ios/multihandtrackinggpu/Info.plist deleted file mode 100644 index 46e3fbd3d9..0000000000 --- a/mediapipe/examples/ios/multihandtrackinggpu/Info.plist +++ /dev/null @@ -1,16 +0,0 @@ - - - - - CameraPosition - front - MainViewController - MultiHandTrackingViewController - GraphOutputStream - output_video - GraphInputStream - input_video - GraphName - multi_hand_tracking_mobile_gpu - - diff --git a/mediapipe/examples/ios/multihandtrackinggpu/MultiHandTrackingViewController.mm b/mediapipe/examples/ios/multihandtrackinggpu/MultiHandTrackingViewController.mm deleted file mode 100644 index 6c1deb7da1..0000000000 --- a/mediapipe/examples/ios/multihandtrackinggpu/MultiHandTrackingViewController.mm +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2019 The MediaPipe Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#import "MultiHandTrackingViewController.h" - -#include "mediapipe/framework/formats/landmark.pb.h" - -static const char* kLandmarksOutputStream = "multi_hand_landmarks"; - -@implementation MultiHandTrackingViewController - -#pragma mark - UIViewController methods - -- (void)viewDidLoad { - [super viewDidLoad]; - - [self.mediapipeGraph addFrameOutputStream:kLandmarksOutputStream - outputPacketType:MPPPacketTypeRaw]; -} - -#pragma mark - MPPGraphDelegate methods - -// Receives a raw packet from the MediaPipe graph. Invoked on a MediaPipe worker thread. 
-- (void)mediapipeGraph:(MPPGraph*)graph - didOutputPacket:(const ::mediapipe::Packet&)packet - fromStream:(const std::string&)streamName { - if (streamName == kLandmarksOutputStream) { - if (packet.IsEmpty()) { - NSLog(@"[TS:%lld] No hand landmarks", packet.Timestamp().Value()); - return; - } - const auto& multi_hand_landmarks = packet.Get>(); - NSLog(@"[TS:%lld] Number of hand instances with landmarks: %lu", packet.Timestamp().Value(), - multi_hand_landmarks.size()); - for (int hand_index = 0; hand_index < multi_hand_landmarks.size(); ++hand_index) { - const auto& landmarks = multi_hand_landmarks[hand_index]; - NSLog(@"\tNumber of landmarks for hand[%d]: %d", hand_index, landmarks.landmark_size()); - for (int i = 0; i < landmarks.landmark_size(); ++i) { - NSLog(@"\t\tLandmark[%d]: (%f, %f, %f)", i, landmarks.landmark(i).x(), - landmarks.landmark(i).y(), landmarks.landmark(i).z()); - } - } - } -} - -@end diff --git a/mediapipe/examples/python/upper_body_pose_tracker.py b/mediapipe/examples/python/upper_body_pose_tracker.py deleted file mode 100644 index 9c1f7dd57f..0000000000 --- a/mediapipe/examples/python/upper_body_pose_tracker.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2020 The MediaPipe Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -"""MediaPipe upper body pose tracker. - -MediaPipe upper body pose tracker takes an RGB image as the input and returns -a pose landmark list and an annotated RGB image represented as a numpy ndarray. - -Usage examples: - pose_tracker = UpperBodyPoseTracker() - - pose_landmarks, _ = pose_tracker.run( - input_file='/tmp/input.png', - output_file='/tmp/output.png') - - input_image = cv2.imread('/tmp/input.png')[:, :, ::-1] - pose_landmarks, annotated_image = pose_tracker.run(input_image) - - pose_tracker.run_live() - - pose_tracker.close() -""" - -import os -import time -from typing import Tuple, Union - -import cv2 -import mediapipe.python as mp -import numpy as np -# resources dependency -from mediapipe.framework.formats import landmark_pb2 - -# Input and output stream names. -INPUT_VIDEO = 'input_video' -OUTPUT_VIDEO = 'output_video' -POSE_LANDMARKS = 'pose_landmarks' - - -class UpperBodyPoseTracker: - """MediaPipe upper body pose tracker.""" - - def __init__(self): - """The init method of MediaPipe upper body pose tracker. - - The method reads the upper body pose tracking cpu binary graph and - initializes a CalculatorGraph from it. The output packets of pose_landmarks - and output_video output streams will be observed by callbacks. The graph - will be started at the end of this method, waiting for input packets. 
- """ - # MediaPipe package root path - root_path = os.sep.join( os.path.abspath(__file__).split(os.sep)[:-4]) - mp.resource_util.set_resource_dir(root_path) - - self._graph = mp.CalculatorGraph( - binary_graph_path=os.path.join( - root_path, - 'mediapipe/graphs/pose_tracking/upper_body_pose_tracking_cpu.binarypb' - )) - self._outputs = {} - for stream_name in [POSE_LANDMARKS, OUTPUT_VIDEO]: - self._graph.observe_output_stream(stream_name, self._assign_packet) - self._graph.start_run() - - def run( - self, - input_frame: np.ndarray = None, - *, - input_file: str = None, - output_file: str = None - ) -> Tuple[Union[None, landmark_pb2.NormalizedLandmarkList], np.ndarray]: - """The run method of MediaPipe upper body pose tracker. - - MediaPipe upper body pose tracker takes either the path to an image file or - an RGB image represented as a numpy ndarray and it returns the pose - landmarks list and the annotated RGB image represented as a numpy ndarray. - - Args: - input_frame: An RGB image represented as a numpy ndarray. - input_file: The path to an image file. - output_file: The file path that the annotated image will be saved into. - - Returns: - pose_landmarks: The pose landmarks list. - annotated_image: The image with pose landmarks annotations. - - Raises: - RuntimeError: If the input frame doesn't contain 3 channels (RGB format) - or the input arg is not correctly provided. - - Examples - pose_tracker = UpperBodyPoseTracker() - pose_landmarks, _ = pose_tracker.run( - input_file='/tmp/input.png', - output_file='/tmp/output.png') - - # Read an image and convert the BGR image to RGB. - input_image = cv2.cvtColor(cv2.imread('/tmp/input.png'), COLOR_BGR2RGB) - pose_landmarks, annotated_image = pose_tracker.run(input_image) - pose_tracker.close() - """ - if input_file is None and input_frame is None: - raise RuntimeError( - 'Must provide either a path to an image file or an RGB image represented as a numpy.ndarray.' - ) - - if input_file: - if input_frame is not None: - raise RuntimeError( - 'Must only provide either \'input_file\' or \'input_frame\'.') - else: - input_frame = cv2.imread(input_file)[:, :, ::-1] - - pose_landmarks, annotated_image = self._run_graph(input_frame) - if output_file: - cv2.imwrite(output_file, annotated_image[:, :, ::-1]) - return pose_landmarks, annotated_image - - def run_live(self) -> None: - """Run MediaPipe upper body pose tracker with live camera input. - - The method will be self-terminated after 30 seconds. If you need to - terminate it earlier, press the Esc key to stop the run manually. Note that - you need to select the output image window rather than the terminal window - first and then press the key. 
- - Examples: - pose_tracker = UpperBodyPoseTracker() - pose_tracker.run_live() - pose_tracker.close() - """ - cap = cv2.VideoCapture(0) - start_time = time.time() - print( - 'Press Esc within the output image window to stop the run, or let it ' - 'self terminate after 30 seconds.') - while cap.isOpened() and time.time() - start_time < 30: - success, input_frame = cap.read() - if not success: - break - input_frame = cv2.cvtColor(cv2.flip(input_frame, 1), cv2.COLOR_BGR2RGB) - input_frame.flags.writeable = False - _, output_frame = self._run_graph(input_frame) - cv2.imshow('MediaPipe upper body pose tracker', - cv2.cvtColor(output_frame, cv2.COLOR_RGB2BGR)) - if cv2.waitKey(5) & 0xFF == 27: - break - cap.release() - cv2.destroyAllWindows() - - def close(self) -> None: - self._graph.close() - self._graph = None - self._outputs = None - - def _run_graph( - self, - input_frame: np.ndarray = None, - ) -> Tuple[Union[None, landmark_pb2.NormalizedLandmarkList], np.ndarray]: - """The internal run graph method. - - Args: - input_frame: An RGB image represented as a numpy ndarray. - - Returns: - pose_landmarks: The pose landmarks list. - annotated_image: The image with pose landmarks annotations. - - Raises: - RuntimeError: If the input frame doesn't contain 3 channels representing - RGB. - """ - - if input_frame.shape[2] != 3: - raise RuntimeError('input frame must have 3 channels.') - - self._outputs.clear() - start_time = time.time() - self._graph.add_packet_to_input_stream( - stream=INPUT_VIDEO, - packet=mp.packet_creator.create_image_frame( - image_format=mp.ImageFormat.SRGB, data=input_frame), - timestamp=mp.Timestamp.from_seconds(start_time)) - self._graph.wait_until_idle() - - pose_landmarks = None - if POSE_LANDMARKS in self._outputs: - pose_landmarks = mp.packet_getter.get_proto(self._outputs[POSE_LANDMARKS]) - annotated_image = mp.packet_getter.get_image_frame( - self._outputs[OUTPUT_VIDEO]).numpy_view() - print('UpperBodyPoseTracker.Run() took', - time.time() - start_time, 'seconds') - return pose_landmarks, annotated_image - - def _assign_packet(self, stream_name: str, packet: mp.Packet) -> None: - self._outputs[stream_name] = packet diff --git a/mediapipe/framework/formats/BUILD b/mediapipe/framework/formats/BUILD index 50774e6de7..dfe432972f 100644 --- a/mediapipe/framework/formats/BUILD +++ b/mediapipe/framework/formats/BUILD @@ -48,6 +48,18 @@ mediapipe_proto_library( visibility = ["//visibility:public"], ) +mediapipe_register_type( + base_name = "classification", + include_headers = ["mediapipe/framework/formats/classification.pb.h"], + types = [ + "::mediapipe::Classification", + "::mediapipe::ClassificationList", + "::std::vector<::mediapipe::Classification>", + "::std::vector<::mediapipe::ClassificationList>", + ], + deps = [":classification_cc_proto"], +) + mediapipe_proto_library( name = "image_format_proto", srcs = ["image_format.proto"], @@ -289,3 +301,51 @@ cc_test( "@com_google_absl//absl/memory", ], ) + +cc_library( + name = "tensor", + srcs = ["tensor.cc"], + hdrs = ["tensor.h"], + copts = select({ + "//mediapipe:apple": [ + "-x objective-c++", + "-fobjc-arc", # enable reference-counting + ], + "//conditions:default": [], + }), + linkopts = select({ + "//mediapipe:ios": [ + "-framework CoreVideo", + "-framework MetalKit", + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + "//mediapipe/framework:port", + "//mediapipe/framework/port:logging", + ] + 
select({ + "//mediapipe/gpu:disable_gpu": [], + "//conditions:default": [ + "//mediapipe/gpu:gl_base", + "//mediapipe/gpu:gl_context", + ], + }), +) + +cc_test( + name = "tensor_test", + srcs = ["tensor_test.cc"], + deps = [ + ":tensor", + "//mediapipe/framework/port:gtest_main", + ] + select({ + "//conditions:default": [ + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:gpu_buffer_format", + ], + "//mediapipe/gpu:disable_gpu": [], + }), +) diff --git a/mediapipe/framework/formats/tensor.cc b/mediapipe/framework/formats/tensor.cc new file mode 100644 index 0000000000..985ae07b9a --- /dev/null +++ b/mediapipe/framework/formats/tensor.cc @@ -0,0 +1,431 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/framework/formats/tensor.h" + +#include +#include + +#include "absl/synchronization/mutex.h" +#include "mediapipe/framework/port.h" +#include "mediapipe/framework/port/logging.h" + +#if MEDIAPIPE_METAL_ENABLED +#include +#include +#else +#include +#endif // MEDIAPIPE_METAL_ENABLED + +namespace mediapipe { + +int BhwcBatchFromShape(const Tensor::Shape& shape) { + LOG_IF(FATAL, shape.dims.empty()) + << "Tensor::Shape must be non-empty to retrieve a named dimension"; + return shape.dims[0]; +} + +int BhwcHeightFromShape(const Tensor::Shape& shape) { + LOG_IF(FATAL, shape.dims.empty()) + << "Tensor::Shape must be non-empty to retrieve a named dimension"; + return shape.dims.size() < 4 ? 1 : shape.dims[shape.dims.size() - 3]; +} + +int BhwcWidthFromShape(const Tensor::Shape& shape) { + LOG_IF(FATAL, shape.dims.empty()) + << "Tensor::Shape must be non-empty to retrieve a named dimension"; + return shape.dims.size() < 3 ? 1 : shape.dims[shape.dims.size() - 2]; +} + +int BhwcDepthFromShape(const Tensor::Shape& shape) { + LOG_IF(FATAL, shape.dims.empty()) + << "Tensor::Shape must be non-empty to retrieve a named dimension"; + return shape.dims.size() < 2 ? 1 : shape.dims[shape.dims.size() - 1]; +} + +// TODO: Match channels count and padding for Texture2D: +// 1) support 1/2/4 channesl texture for 1/2/3-4 depth. +// 2) Allocate cpu_buffer_ with padded amount of memory +// 3) pad/"unpad" the bitmap after transfer CPU <-> GPU + +#if MEDIAPIPE_METAL_ENABLED +namespace { +// MTLBuffer can use existing properly aligned and allocated CPU memory. 
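// As a concrete illustration of the rounding done by AlignToPageSize below
// (a 16 KiB page size is assumed here purely for illustration; the real value
// comes from getpagesize()):
//
//   (10000 + 16384 - 1) / 16384 * 16384  ==  16384   // one page
//   (20000 + 16384 - 1) / 16384 * 16384  ==  32768   // two pages
//
// The CPU backing store is therefore always a whole number of VM pages, which
// matches the page-aligned allocation that AllocateMtlBuffer later hands to
// newBufferWithBytesNoCopy so the buffer can be wrapped without copying.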
+size_t AlignToPageSize(size_t size) { + auto page_size = getpagesize(); + return (size + page_size - 1) / page_size * page_size; +} + +void* AllocateVirtualMemory(size_t size) { + vm_address_t data; + auto error = vm_allocate(mach_task_self(), &data, AlignToPageSize(size), + VM_FLAGS_ANYWHERE); + LOG_IF(FATAL, error != KERN_SUCCESS) + << "Can't allocate virtual memory for Tensor."; + return reinterpret_cast(data); +} + +void DeallocateVirtualMemory(void* pointer, size_t size) { + vm_deallocate(mach_task_self(), reinterpret_cast(pointer), + size); +} +} // namespace + +Tensor::MtlBufferView Tensor::GetMtlBufferReadView( + id command_buffer) const { + LOG_IF(FATAL, valid_ == kValidNone) + << "Tensor must be written prior to read from."; + LOG_IF(FATAL, !(valid_ & (kValidCpu | kValidMetalBuffer))) + << "Tensor conversion between different GPU resources is not supported " + "yet."; + auto lock(absl::make_unique(&view_mutex_)); + valid_ |= kValidMetalBuffer; + AllocateMtlBuffer([command_buffer device]); + return {metal_buffer_, std::move(lock)}; +} + +Tensor::MtlBufferView Tensor::GetMtlBufferWriteView( + id command_buffer) const { + // Don't overwrite command buffer at which the metal buffer has been written + // so we can wait until completed. + command_buffer_ = command_buffer; + return GetMtlBufferWriteView([command_buffer device]); +} + +Tensor::MtlBufferView Tensor::GetMtlBufferWriteView( + id device) const { + auto lock(absl::make_unique(&view_mutex_)); + valid_ = kValidMetalBuffer; + AllocateMtlBuffer(device); + return {metal_buffer_, std::move(lock)}; +} + +void Tensor::AllocateMtlBuffer(id device) const { + device_ = device; + if (!cpu_buffer_) { + // It also means that the metal buffer is not allocated yet. + cpu_buffer_ = AllocateVirtualMemory(bytes()); + } + if (!metal_buffer_) { + metal_buffer_ = + [device_ newBufferWithBytesNoCopy:cpu_buffer_ + length:AlignToPageSize(bytes()) + options:MTLResourceStorageModeShared | + MTLResourceCPUCacheModeDefaultCache + deallocator:^(void* pointer, NSUInteger length) { + DeallocateVirtualMemory(pointer, length); + }]; + } +} +#endif // MEDIAPIPE_METAL_ENABLED + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +Tensor::OpenGlTexture2dView Tensor::GetOpenGlTexture2dReadView() const { + LOG_IF(FATAL, BhwcDepthFromShape(shape_) > 4) + << "OpenGlTexture2d supports depth <= 4. 
Current depth is " + << BhwcDepthFromShape(shape_); + LOG_IF(FATAL, valid_ == kValidNone) + << "Tensor must be written prior to read from."; + LOG_IF(FATAL, !(valid_ & (kValidCpu | kValidOpenGlTexture2d))) + << "Tensor conversion between different GPU resources is not supported " + "yet."; + auto lock = absl::make_unique(&view_mutex_); + AllocateOpenGlTexture2d(); + if (!(valid_ & kValidOpenGlTexture2d)) { + uint8_t* buffer; + std::unique_ptr temp_buffer; + if (BhwcDepthFromShape(shape_) == 4) { + buffer = reinterpret_cast(cpu_buffer_); + } else { + const int padded_depth = 4; + const int padded_depth_size = padded_depth * element_size(); + const int padded_size = BhwcBatchFromShape(shape_) * + BhwcHeightFromShape(shape_) * + BhwcWidthFromShape(shape_) * padded_depth_size; + temp_buffer = absl::make_unique(padded_size); + buffer = temp_buffer.get(); + uint8_t* src_buffer = reinterpret_cast(cpu_buffer_); + const int actual_depth_size = BhwcDepthFromShape(shape_) * element_size(); + for (int e = 0; + e < BhwcBatchFromShape(shape_) * BhwcHeightFromShape(shape_) * + BhwcWidthFromShape(shape_); + e++) { + std::memcpy(buffer, src_buffer, actual_depth_size); + src_buffer += actual_depth_size; + buffer += padded_depth_size; + } + } + // Transfer from CPU memory into GPU memory. + glBindTexture(GL_TEXTURE_2D, opengl_texture2d_); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, BhwcWidthFromShape(shape_), + BhwcHeightFromShape(shape_), GL_RGBA, GL_FLOAT, buffer); + glBindTexture(GL_TEXTURE_2D, 0); + valid_ |= kValidOpenGlTexture2d; + } + return {opengl_texture2d_, std::move(lock)}; +} + +Tensor::OpenGlTexture2dView Tensor::GetOpenGlTexture2dWriteView() const { + auto lock = absl::make_unique(&view_mutex_); + AllocateOpenGlTexture2d(); + valid_ = kValidOpenGlTexture2d; + return {opengl_texture2d_, std::move(lock)}; +} + +void Tensor::AllocateOpenGlTexture2d() const { + if (opengl_texture2d_ == GL_INVALID_INDEX) { + gl_context_ = mediapipe::GlContext::GetCurrent(); + LOG_IF(FATAL, !gl_context_) << "GlContext is not bound to the thread."; + glGenTextures(1, &opengl_texture2d_); + glBindTexture(GL_TEXTURE_2D, opengl_texture2d_); + // Texture2D represents a buffer with computable data so should be fetched + // but not sampled - can affect performance. Also on GLES2.0 sampling is not + // supported from floating point textures. 
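  // Note on sizing: storage is always allocated as GL_RGBA32F (glTexStorage2D
  // below), so a tensor with depth < 4 still occupies 4 floats per texel. For
  // an illustrative 1x8x8x3 float32 tensor that means an 8x8 texture holding
  // 8 * 8 * 4 * 4 = 1024 bytes for 768 bytes of payload; the read view above
  // pads and compacts the data accordingly.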
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA32F, BhwcWidthFromShape(shape_), + BhwcHeightFromShape(shape_)); + glBindTexture(GL_TEXTURE_2D, 0); + } +} +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 +Tensor::OpenGlBufferView Tensor::GetOpenGlBufferReadView() const { + LOG_IF(FATAL, valid_ == kValidNone) + << "Tensor must be written prior to read from."; + LOG_IF(FATAL, !(valid_ & (kValidCpu | kValidOpenGlBuffer))) + << "Tensor conversion between different GPU resources is not supported " + "yet."; + auto lock(absl::make_unique(&view_mutex_)); + AllocateOpenGlBuffer(); + if (!(valid_ & kValidOpenGlBuffer)) { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, opengl_buffer_); + void* ptr = + glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, bytes(), + GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_WRITE_BIT); + std::memcpy(ptr, cpu_buffer_, bytes()); + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); + valid_ |= kValidOpenGlBuffer; + } + return {opengl_buffer_, std::move(lock)}; +} + +Tensor::OpenGlBufferView Tensor::GetOpenGlBufferWriteView() const { + auto lock(absl::make_unique(&view_mutex_)); + AllocateOpenGlBuffer(); + valid_ = kValidOpenGlBuffer; + return {opengl_buffer_, std::move(lock)}; +} + +void Tensor::AllocateOpenGlBuffer() const { + if (opengl_buffer_ == GL_INVALID_INDEX) { + gl_context_ = mediapipe::GlContext::GetCurrent(); + LOG_IF(FATAL, !gl_context_) << "GlContext is not bound to the thread."; + glGenBuffers(1, &opengl_buffer_); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, opengl_buffer_); + glBufferData(GL_SHADER_STORAGE_BUFFER, bytes(), NULL, GL_STREAM_COPY); + } +} +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + +Tensor& Tensor::operator=(Tensor&& src) { + if (this != &src) { + Invalidate(); + Move(&src); + } + return *this; +} + +void Tensor::Move(Tensor* src) { + valid_ = src->valid_; + src->valid_ = kValidNone; + shape_ = src->shape(); + element_type_ = src->element_type(); + src->element_type_ = ElementType::kNone; // Mark as invalidated. + cpu_buffer_ = src->cpu_buffer_; + src->cpu_buffer_ = nullptr; +#if MEDIAPIPE_METAL_ENABLED + device_ = src->device_; + command_buffer_ = src->command_buffer_; + metal_buffer_ = src->metal_buffer_; + src->metal_buffer_ = nil; +#endif // MEDIAPIPE_METAL_ENABLED + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + gl_context_ = std::move(src->gl_context_); + opengl_texture2d_ = src->opengl_texture2d_; + src->opengl_texture2d_ = GL_INVALID_INDEX; +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + opengl_buffer_ = src->opengl_buffer_; + src->opengl_buffer_ = GL_INVALID_INDEX; +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +} + +Tensor::Tensor(ElementType element_type, const Shape& shape) + : element_type_(element_type), shape_(shape) {} + +void Tensor::Invalidate() { + absl::MutexLock lock(&view_mutex_); +#if MEDIAPIPE_METAL_ENABLED + // If memory is allocated and not owned by the metal buffer. + // TODO: Re-design cpu buffer memory management. 
+ if (cpu_buffer_ && !metal_buffer_) { + DeallocateVirtualMemory(cpu_buffer_, AlignToPageSize(bytes())); + } + metal_buffer_ = nil; +#else + if (cpu_buffer_) { + free(cpu_buffer_); + } +#endif // MEDIAPIPE_METAL_ENABLED + cpu_buffer_ = nullptr; + + // Don't need to wait for the resource to be deleted bacause if will be + // released on last reference deletion inside the OpenGL driver. +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + if (opengl_texture2d_ != GL_INVALID_INDEX) { + GLuint opengl_texture2d = opengl_texture2d_; + gl_context_->RunWithoutWaiting( + [opengl_texture2d]() { glDeleteTextures(1, &opengl_texture2d); }); + opengl_texture2d_ = GL_INVALID_INDEX; + } +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + if (opengl_buffer_ != GL_INVALID_INDEX) { + GLuint opengl_buffer = opengl_buffer_; + gl_context_->RunWithoutWaiting( + [opengl_buffer]() { glDeleteBuffers(1, &opengl_buffer); }); + opengl_buffer_ = GL_INVALID_INDEX; + } +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +} + +Tensor::CpuReadView Tensor::GetCpuReadView() const { + auto lock = absl::make_unique(&view_mutex_); + LOG_IF(FATAL, valid_ == kValidNone) + << "Tensor must be written prior to read from."; + AllocateCpuBuffer(); + if (!(valid_ & kValidCpu)) { + // GPU-to-CPU synchronization and read-back. +#if MEDIAPIPE_METAL_ENABLED + if (valid_ & kValidMetalBuffer) { + LOG_IF(FATAL, !command_buffer_) << "Metal -> CPU synchronization " + "requires MTLCommandBuffer to be set."; + if (command_buffer_) { + [command_buffer_ waitUntilCompleted]; + } + } +#endif // MEDIAPIPE_METAL_ENABLED + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + if (valid_ & kValidOpenGlBuffer) { + gl_context_->Run([this]() { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, opengl_buffer_); + const void* ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, bytes(), + GL_MAP_READ_BIT); + std::memcpy(cpu_buffer_, ptr, bytes()); + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); + }); + } else +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + + // Transfer data from texture if not transferred from SSBO/MTLBuffer + // yet. + if (valid_ & kValidOpenGlTexture2d) { + gl_context_->Run([this]() { + GLint current_fbo; + glGetIntegerv(GL_FRAMEBUFFER_BINDING, ¤t_fbo); + + uint8_t* buffer; + std::unique_ptr temp_buffer; + if (BhwcDepthFromShape(shape_) == 4) { + buffer = reinterpret_cast(cpu_buffer_); + } else { + const int padded_depth = (BhwcDepthFromShape(shape_) + 3) / 4 * 4; + const int padded_size = + BhwcBatchFromShape(shape_) * BhwcHeightFromShape(shape_) * + BhwcWidthFromShape(shape_) * padded_depth * element_size(); + temp_buffer = absl::make_unique(padded_size); + buffer = temp_buffer.get(); + } + + GLint color_attachment_name; + glGetFramebufferAttachmentParameteriv( + GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME, &color_attachment_name); + if (color_attachment_name != opengl_texture2d_) { + // Save the viewport. Note that we assume that the color attachment is + // a GL_TEXTURE_2D texture. + GLint viewport[4]; + glGetIntegerv(GL_VIEWPORT, viewport); + + // Set the data from GLTexture object. 
+ glViewport(0, 0, BhwcWidthFromShape(shape_), + BhwcHeightFromShape(shape_)); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, opengl_texture2d_, 0); + glReadPixels(0, 0, BhwcWidthFromShape(shape_), + BhwcHeightFromShape(shape_), GL_RGBA, GL_FLOAT, buffer); + + // Restore from the saved viewport and color attachment name. + glViewport(viewport[0], viewport[1], viewport[2], viewport[3]); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, color_attachment_name, 0); + } else { + glReadPixels(0, 0, BhwcWidthFromShape(shape_), + BhwcHeightFromShape(shape_), GL_RGBA, GL_FLOAT, buffer); + } + if (BhwcDepthFromShape(shape_) < 4) { + uint8_t* dest_buffer = reinterpret_cast(cpu_buffer_); + const int actual_depth_size = + BhwcDepthFromShape(shape_) * element_size(); + const int padded_depth_size = 4 * element_size(); + for (int e = 0; + e < BhwcBatchFromShape(shape_) * BhwcHeightFromShape(shape_) * + BhwcWidthFromShape(shape_); + e++) { + std::memcpy(dest_buffer, buffer, actual_depth_size); + dest_buffer += actual_depth_size; + buffer += padded_depth_size; + } + } + }); + } +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + valid_ |= kValidCpu; + } + return {cpu_buffer_, std::move(lock)}; +} + +Tensor::CpuWriteView Tensor::GetCpuWriteView() const { + auto lock = absl::make_unique(&view_mutex_); + AllocateCpuBuffer(); + valid_ = kValidCpu; + return {cpu_buffer_, std::move(lock)}; +} + +void Tensor::AllocateCpuBuffer() const { + if (!cpu_buffer_) { +#if MEDIAPIPE_METAL_ENABLED + cpu_buffer_ = AllocateVirtualMemory(bytes()); +#else + cpu_buffer_ = malloc(bytes()); +#endif // MEDIAPIPE_METAL_ENABLED + } +} + +} // namespace mediapipe diff --git a/mediapipe/framework/formats/tensor.h b/mediapipe/framework/formats/tensor.h new file mode 100644 index 0000000000..b6a55145d3 --- /dev/null +++ b/mediapipe/framework/formats/tensor.h @@ -0,0 +1,266 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_FRAMEWORK_FORMATS_TENSOR_H_ +#define MEDIAPIPE_FRAMEWORK_FORMATS_TENSOR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/synchronization/mutex.h" +#include "mediapipe/framework/port.h" + +#if MEDIAPIPE_METAL_ENABLED +#import +#endif // MEDIAPIPE_METAL_ENABLED + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +#include "mediapipe/gpu/gl_base.h" +#include "mediapipe/gpu/gl_context.h" +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + +namespace mediapipe { + +// Tensor is a container of multi-dimensional data that supports sharing the +// content across different backends and APIs, currently: CPU / Metal / OpenGL. +// Texture2DView is limited to 4 dimensions. +// The content is accessible through requesting device specific views. +// Acquiring a view guarantees that the content is not changed by another thread +// until the view is released. 
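//
// A minimal CPU-only sketch (shape and values are illustrative):
//
//   Tensor tensor(Tensor::ElementType::kFloat32, Tensor::Shape{1, 3});
//   {
//     auto view = tensor.GetCpuWriteView();
//     float* data = view.buffer<float>();
//     data[0] = 1.f; data[1] = 2.f; data[2] = 3.f;
//   }  // Destroying the view releases the lock; other backends may now read.
//
// GPU access follows the same pattern, e.g. for Metal: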
+// +// Tensor::MtlBufferView view = tensor.GetMtlBufferWriteView(mtl_device); +// mtl_device is used to create MTLBuffer +// id buffer = view.buffer(); +// For OpenGL the code below must be called by a thread with valid OpenGL ES +// context bound: +// GLuint buffer = view.buffer(); +// Then the buffer can be bound to the GPU command buffer. +// ...binding the buffer to the command buffer... +// ...commiting command buffer and releasing the view... +// +// The following request for the CPU view will be blocked until the GPU view is +// released and the GPU task is finished. +// +// auto view = tensor.GetCpuReadView(); +// float* pointer = view.buffer(); +// ...reading the cpu memory... + +class Tensor { + class View { + public: + // Non-copyable. + View(const View&) = delete; + View& operator=(const View&) = delete; + View(View&& src) = default; + + protected: + View(std::unique_ptr&& lock) : lock_(std::move(lock)) {} + std::unique_ptr lock_; + }; + + public: + // No resources are allocated here. + enum class ElementType { kNone, kFloat16, kFloat32 }; + struct Shape { + Shape() = default; + Shape(std::initializer_list dimensions) : dims(dimensions) {} + Shape(const std::vector& dimensions) : dims(dimensions) {} + int num_elements() const { + int res = dims.empty() ? 0 : 1; + std::for_each(dims.begin(), dims.end(), [&res](int i) { res *= i; }); + return res; + } + std::vector dims; + }; + + Tensor(ElementType element_type, const Shape& shape); + + // Non-copyable. + Tensor(const Tensor&) = delete; + Tensor& operator=(const Tensor&) = delete; + // Move-only. + Tensor(Tensor&& src) { Move(&src); } + Tensor& operator=(Tensor&&); + ~Tensor() { Invalidate(); } + + template + class CpuView : public View { + public: + template + auto buffer() const { + // const and non-const return type selection. + return static_cast::value, std::tuple >::type>(buffer_); + } + CpuView(CpuView&& src) : View(std::move(src)), buffer_(src.buffer_) { + src.buffer_ = nullptr; + } + + protected: + friend class Tensor; + CpuView(T* buffer, std::unique_ptr&& lock) + : View(std::move(lock)), buffer_(buffer) {} + T* buffer_; + }; + using CpuReadView = CpuView; + CpuReadView GetCpuReadView() const; + using CpuWriteView = CpuView; + CpuWriteView GetCpuWriteView() const; + +#if MEDIAPIPE_METAL_ENABLED + // TODO: id vs. MtlBufferView. + class MtlBufferView : public View { + public: + id buffer() const { return buffer_; } + MtlBufferView(MtlBufferView&& src) + : View(std::move(src)), buffer_(src.buffer_) { + src.buffer_ = nil; + } + + protected: + friend class Tensor; + MtlBufferView(id buffer, std::unique_ptr&& lock) + : View(std::move(lock)), buffer_(buffer) {} + id buffer_; + }; + // The command buffer status is checked for completeness if GPU-to-CPU + // synchronization is required. + // TODO: Design const and non-const view acquiring. + MtlBufferView GetMtlBufferReadView(id command_buffer) const; + MtlBufferView GetMtlBufferWriteView( + id command_buffer) const; + // Allocate new buffer. + // TODO: GPU-to-CPU design considerations. + MtlBufferView GetMtlBufferWriteView(id device) const; +#endif // MEDIAPIPE_METAL_ENABLED + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + // TODO: Use GlTextureView instead. + // Only float32 textures are supported with 1/2/3/4 depths. + // OpenGlTexture2dView currently only supports BHWC memory layout. 
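  // The BHWC interpretation follows the Bhwc*FromShape helpers in tensor.cc:
  // the first dimension is the batch, the last is the channel/depth, and any
  // absent height/width dimensions default to 1; e.g. an illustrative shape of
  // {1, 128} is read as B=1, H=1, W=1, C=128.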
+ class OpenGlTexture2dView : public View { + public: + GLuint name() const { return name_; } + OpenGlTexture2dView(OpenGlTexture2dView&& src) + : View(std::move(src)), name_(src.name_) { + src.name_ = GL_INVALID_INDEX; + } + + protected: + friend class Tensor; + OpenGlTexture2dView(GLuint name, std::unique_ptr&& lock) + : View(std::move(lock)), name_(name) {} + GLuint name_; + }; + // A valid OpenGL context must be bound to the calling thread due to possible + // GPU resource allocation. + OpenGlTexture2dView GetOpenGlTexture2dReadView() const; + OpenGlTexture2dView GetOpenGlTexture2dWriteView() const; +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + class OpenGlBufferView : public View { + public: + GLuint name() const { return name_; } + OpenGlBufferView(OpenGlBufferView&& src) + : View(std::move(src)), name_(src.name_) { + src.name_ = GL_INVALID_INDEX; + } + + protected: + friend class Tensor; + OpenGlBufferView(GLuint name, std::unique_ptr&& lock) + : View(std::move(lock)), name_(name) {} + GLuint name_; + }; + // A valid OpenGL context must be bound to the calling thread due to possible + // GPU resource allocation. + OpenGlBufferView GetOpenGlBufferReadView() const; + OpenGlBufferView GetOpenGlBufferWriteView() const; +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + + const Shape& shape() const { return shape_; } + ElementType element_type() const { return element_type_; } + int element_size() const { + switch (element_type_) { + case ElementType::kNone: + return 0; + case ElementType::kFloat16: + return 2; + case ElementType::kFloat32: + return sizeof(float); + } + } + int bytes() const { return shape_.num_elements() * element_size(); } + + bool ready_on_cpu() const { return valid_ & kValidCpu; } + bool ready_on_gpu() const { + return valid_ & + (kValidMetalBuffer | kValidOpenGlBuffer | kValidOpenGlTexture2d); + } + bool ready_as_metal_buffer() const { return valid_ & kValidMetalBuffer; } + bool ready_as_opengl_buffer() const { return valid_ & kValidOpenGlBuffer; } + bool ready_as_opengl_texture_2d() const { + return valid_ & kValidOpenGlTexture2d; + } + + private: + void Move(Tensor*); + void Invalidate(); + + ElementType element_type_; + Shape shape_; + + // The flags describe the current source of truth resource type. + enum { + kValidNone = 0, + kValidCpu = 1 << 0, + kValidMetalBuffer = 1 << 1, + kValidOpenGlBuffer = 1 << 2, + kValidOpenGlTexture2d = 1 << 3, + }; + // A list of resource which are currently allocated and synchronized between + // each-other: valid_ = kValidCpu | kValidMetalBuffer; + mutable int valid_ = 0; + // The mutex is locked by Get*View and is kept by all Views. 
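  // Because the lock is exclusive, a concurrent Get*View call from another
  // thread blocks until the previously returned View has been destroyed.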
+ mutable absl::Mutex view_mutex_; + + mutable void* cpu_buffer_ = nullptr; + void AllocateCpuBuffer() const; +#if MEDIAPIPE_METAL_ENABLED + mutable id command_buffer_; + mutable id device_; + mutable id metal_buffer_; + void AllocateMtlBuffer(id device) const; +#endif // MEDIAPIPE_METAL_ENABLED + +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 + mutable std::shared_ptr gl_context_; + mutable GLuint opengl_texture2d_ = GL_INVALID_INDEX; + void AllocateOpenGlTexture2d() const; +#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 + mutable GLuint opengl_buffer_ = GL_INVALID_INDEX; + void AllocateOpenGlBuffer() const; +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31 +#endif // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30 +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_FRAMEWORK_FORMATS_TENSOR_H_ diff --git a/mediapipe/framework/formats/tensor_test.cc b/mediapipe/framework/formats/tensor_test.cc new file mode 100644 index 0000000000..42c86fd4c0 --- /dev/null +++ b/mediapipe/framework/formats/tensor_test.cc @@ -0,0 +1,62 @@ +#include "mediapipe/framework/formats/tensor.h" + +#include "mediapipe/framework/port/gmock.h" +#include "mediapipe/framework/port/gtest.h" +#if !defined(MEDIAPIPE_DISABLE_GPU) +#include "mediapipe/gpu/gl_calculator_helper.h" +#include "mediapipe/gpu/gpu_buffer_format.h" +#endif + +namespace mediapipe { + +TEST(General, TestDimensions) { + Tensor t1(Tensor::ElementType::kFloat32, Tensor::Shape{1, 2, 3, 4}); + EXPECT_EQ(t1.shape().num_elements(), 1 * 2 * 3 * 4); + + Tensor t2(Tensor::ElementType::kFloat16, Tensor::Shape{4, 3, 2, 3}); + EXPECT_EQ(t2.shape().num_elements(), 4 * 3 * 2 * 3); +} + +TEST(General, TestDataTypes) { + Tensor t1(Tensor::ElementType::kFloat32, Tensor::Shape{1, 2, 3, 4}); + EXPECT_EQ(t1.bytes(), t1.shape().num_elements() * sizeof(float)); + + Tensor t2(Tensor::ElementType::kFloat16, Tensor::Shape{4, 3, 2, 3}); + EXPECT_EQ(t2.bytes(), t2.shape().num_elements() * 2); +} + +TEST(Cpu, TestMemoryAllocation) { + Tensor t1(Tensor::ElementType::kFloat32, Tensor::Shape{4, 3, 2, 3}); + auto v1 = t1.GetCpuWriteView(); + float* f1 = v1.buffer(); + EXPECT_NE(f1, nullptr); +} + +TEST(Cpu, TestTensorMove) { + Tensor t1(Tensor::ElementType::kFloat32, Tensor::Shape{4, 3, 2, 3}); + void* p1 = t1.GetCpuWriteView().buffer(); + EXPECT_NE(p1, nullptr); + Tensor t2(std::move(t1)); + EXPECT_NE(t2.bytes(), 0); + EXPECT_EQ(t1.bytes(), 0); // NOLINT + void* p2 = t2.GetCpuWriteView().buffer(); + EXPECT_EQ(p1, p2); +} + +TEST(Cpu, TestViewMove) { + Tensor t(Tensor::ElementType::kFloat32, Tensor::Shape{4, 3, 2, 3}); + auto v1 = t.GetCpuWriteView(); + auto p1 = v1.buffer(); + EXPECT_NE(p1, nullptr); + Tensor::CpuWriteView v2(std::move(v1)); + auto p2 = v2.buffer(); + EXPECT_EQ(p1, p2); + EXPECT_EQ(v1.buffer(), nullptr); // NOLINT +} + +} // namespace mediapipe + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/mediapipe/framework/legacy_calculator_support.h b/mediapipe/framework/legacy_calculator_support.h index 75ffa68e00..9378d14f0b 100644 --- a/mediapipe/framework/legacy_calculator_support.h +++ b/mediapipe/framework/legacy_calculator_support.h @@ -61,7 +61,7 @@ class LegacyCalculatorSupport { // platforms. 
#ifndef __APPLE__ ABSL_CONST_INIT -#endif // !__APPLE__ +#endif // !__APPLE__ static thread_local C* current_; // NOLINT }; }; diff --git a/mediapipe/framework/port.h b/mediapipe/framework/port.h index d88752d9d1..521620520d 100644 --- a/mediapipe/framework/port.h +++ b/mediapipe/framework/port.h @@ -73,10 +73,13 @@ #elif defined(MEDIAPIPE_OSX) #define MEDIAPIPE_OPENGL_ES_VERSION 0 #define MEDIAPIPE_METAL_ENABLED 1 -#else +#elif defined(__EMSCRIPTEN__) // WebGL config. #define MEDIAPIPE_OPENGL_ES_VERSION MEDIAPIPE_OPENGL_ES_30 #define MEDIAPIPE_METAL_ENABLED 0 +#else +#define MEDIAPIPE_OPENGL_ES_VERSION MEDIAPIPE_OPENGL_ES_31 +#define MEDIAPIPE_METAL_ENABLED 0 #endif #endif diff --git a/mediapipe/framework/tool/BUILD b/mediapipe/framework/tool/BUILD index 2341d00232..1e4624c825 100644 --- a/mediapipe/framework/tool/BUILD +++ b/mediapipe/framework/tool/BUILD @@ -132,6 +132,7 @@ cc_library( visibility = ["//mediapipe/framework:mediapipe_internal"], deps = [ "//mediapipe/framework:calculator_cc_proto", + "//mediapipe/framework:input_stream_shard", "//mediapipe/framework:packet", "//mediapipe/framework:packet_generator_cc_proto", "//mediapipe/framework:packet_set", diff --git a/mediapipe/framework/tool/options_util.h b/mediapipe/framework/tool/options_util.h index 3a97e6bd13..ee946ff23f 100644 --- a/mediapipe/framework/tool/options_util.h +++ b/mediapipe/framework/tool/options_util.h @@ -16,6 +16,7 @@ #define MEDIAPIPE_FRAMEWORK_TOOL_OPTIONS_UTIL_H_ #include "mediapipe/framework/calculator.pb.h" +#include "mediapipe/framework/input_stream_shard.h" #include "mediapipe/framework/packet.h" #include "mediapipe/framework/packet_generator.pb.h" #include "mediapipe/framework/packet_set.h" @@ -96,21 +97,42 @@ void GetNodeOptions(const CalculatorGraphConfig::Node& node_config, T* result) { // packet can hold either the specified options type T or CalculatorOptions. // Fields are either replaced or merged depending on field merge_fields. template -inline T RetrieveOptions(const T& base, const PacketSet& packet_set, - const std::string& tag_name) { - if (packet_set.HasTag(tag_name)) { - const Packet& packet = packet_set.Tag(tag_name); +inline T RetrieveOptions(const T& base, const Packet& options_packet) { + if (!options_packet.IsEmpty()) { T packet_options; - if (packet.ValidateAsType().ok()) { - packet_options = packet.Get(); - } else if (packet.ValidateAsType().ok()) { - GetExtension(packet.Get(), &packet_options); + if (options_packet.ValidateAsType().ok()) { + packet_options = options_packet.Get(); + } else if (options_packet.ValidateAsType().ok()) { + GetExtension(options_packet.Get(), &packet_options); } return tool::MergeOptions(base, packet_options); } return base; } +// Combine a base options message with an optional side packet from +// a PacketSet such as a calculator's input-side-packets. +template +inline T RetrieveOptions(const T& base, const PacketSet& packet_set, + const std::string& tag_name = "OPTIONS") { + if (packet_set.HasTag(tag_name)) { + return tool::RetrieveOptions(base, packet_set.Tag(tag_name)); + } + return base; +} + +// Combine a base options message with an optional input packet from +// an InputStreamShardSet such as a calculator's input streams. 
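// For example, inside a calculator whose graph wires an optional "OPTIONS"
// input stream (the options type below is illustrative):
//
//   MyCalculatorOptions options = tool::RetrieveOptions(
//       cc->Options<MyCalculatorOptions>(), cc->Inputs());
//
// Fields present in the packet are merged into or replace the static node
// options, as described above.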
+template +inline T RetrieveOptions(const T& base, const InputStreamShardSet& stream_set, + const std::string& tag_name = "OPTIONS") { + if (stream_set.HasTag(tag_name)) { + Packet options_packet = stream_set.Tag(tag_name).Value(); + return tool::RetrieveOptions(base, options_packet); + } + return base; +} + // Extracts the options message of a specified type from a // CalculatorGraphConfig::Node. class OptionsMap { diff --git a/mediapipe/gpu/gl_quad_renderer.cc b/mediapipe/gpu/gl_quad_renderer.cc index c66b26540e..38a1f35ee7 100644 --- a/mediapipe/gpu/gl_quad_renderer.cc +++ b/mediapipe/gpu/gl_quad_renderer.cc @@ -84,6 +84,7 @@ ::mediapipe::Status QuadRenderer::GlSetup( scale_unif_ = glGetUniformLocation(program_, "scale"); RET_CHECK(scale_unif_ != -1) << "could not find uniform 'scale'"; + glGenVertexArrays(1, &vao_); glGenBuffers(2, vbo_); return ::mediapipe::OkStatus(); @@ -94,6 +95,10 @@ void QuadRenderer::GlTeardown() { glDeleteProgram(program_); program_ = 0; } + if (vao_) { + glDeleteVertexArrays(1, &vao_); + vao_ = 0; + } if (vbo_[0]) { glDeleteBuffers(2, vbo_); vbo_[0] = 0; @@ -166,6 +171,7 @@ ::mediapipe::Status QuadRenderer::GlRender( // TODO: In practice, our vertex attributes almost never change, so // convert this to being actually static, with initialization done in the // GLSetup. + glBindVertexArray(vao_); glEnableVertexAttribArray(ATTRIB_VERTEX); glBindBuffer(GL_ARRAY_BUFFER, vbo_[0]); glBufferData(GL_ARRAY_BUFFER, sizeof(mediapipe::kBasicSquareVertices), @@ -187,6 +193,7 @@ ::mediapipe::Status QuadRenderer::GlRender( glDisableVertexAttribArray(ATTRIB_VERTEX); glDisableVertexAttribArray(ATTRIB_TEXTURE_POSITION); glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindVertexArray(0); return ::mediapipe::OkStatus(); } diff --git a/mediapipe/gpu/gl_quad_renderer.h b/mediapipe/gpu/gl_quad_renderer.h index 50a32dad48..e5fd06c708 100644 --- a/mediapipe/gpu/gl_quad_renderer.h +++ b/mediapipe/gpu/gl_quad_renderer.h @@ -83,6 +83,7 @@ class QuadRenderer { GLuint program_ = 0; GLint scale_unif_ = -1; std::vector frame_unifs_; + GLuint vao_; // vertex array object GLuint vbo_[2] = {0, 0}; // for vertex buffer storage }; diff --git a/mediapipe/gpu/gl_scaler_calculator.cc b/mediapipe/gpu/gl_scaler_calculator.cc index d8d90a5245..8806267be2 100644 --- a/mediapipe/gpu/gl_scaler_calculator.cc +++ b/mediapipe/gpu/gl_scaler_calculator.cc @@ -215,7 +215,7 @@ ::mediapipe::Status GlScalerCalculator::Process(CalculatorContext* cc) { src1 = helper_.CreateSourceTexture(input, 0); src2 = helper_.CreateSourceTexture(input, 1); } else // NOLINT(readability/braces) -#endif // __APPLE__ +#endif // __APPLE__ { src1 = helper_.CreateSourceTexture(input); #ifdef __ANDROID__ @@ -227,7 +227,7 @@ ::mediapipe::Status GlScalerCalculator::Process(CalculatorContext* cc) { } renderer = ext_rgb_renderer_.get(); } else // NOLINT(readability/braces) -#endif // __ANDROID__ +#endif // __ANDROID__ { if (!rgb_renderer_) { rgb_renderer_ = absl::make_unique(); diff --git a/mediapipe/gpu/gl_simple_shaders.cc b/mediapipe/gpu/gl_simple_shaders.cc index b9bef4c5a1..1e6eefb5aa 100644 --- a/mediapipe/gpu/gl_simple_shaders.cc +++ b/mediapipe/gpu/gl_simple_shaders.cc @@ -69,6 +69,7 @@ namespace mediapipe { "#elif __VERSION__ > 320 && !defined(GL_ES)\n" \ "out vec4 frag_out; \n" \ "#define gl_FragColor frag_out\n" \ + "#define texture2D texture\n" \ "#endif // __VERSION__ < 130\n" const GLchar* const kMediaPipeVertexShaderPreamble = VERTEX_PREAMBLE; @@ -104,7 +105,7 @@ const GLchar* const kBasicTexturedFragmentShader = FRAGMENT_PREAMBLE 
_STRINGIFY( in mediump vec2 sample_coordinate; // texture coordinate (0..1) uniform sampler2D video_frame; - void main() { gl_FragColor = texture2D(video_frame, sample_coordinate); }); + void main() { gl_FragColor = texture(video_frame, sample_coordinate); }); const GLchar* const kBasicTexturedFragmentShaderOES = FRAGMENT_PREAMBLE "#extension GL_OES_EGL_image_external : require\n" _STRINGIFY( @@ -114,7 +115,7 @@ const GLchar* const kBasicTexturedFragmentShaderOES = FRAGMENT_PREAMBLE uniform samplerExternalOES video_frame; void main() { - gl_FragColor = texture2D(video_frame, sample_coordinate); + gl_FragColor = texture(video_frame, sample_coordinate); }); const GLchar* const kFlatColorFragmentShader = FRAGMENT_PREAMBLE _STRINGIFY( @@ -131,7 +132,7 @@ const GLchar* const kRgbWeightFragmentShader = FRAGMENT_PREAMBLE _STRINGIFY( uniform sampler2D video_frame; uniform vec3 weights; // r,g,b weights void main() { - vec4 color = texture2D(video_frame, sample_coordinate); + vec4 color = texture(video_frame, sample_coordinate); gl_FragColor.bgra = vec4(weights.z * color.b, weights.y * color.g, weights.x * color.r, color.a); }); @@ -145,10 +146,10 @@ const GLchar* const kYUV2TexToRGBFragmentShader = FRAGMENT_PREAMBLE _STRINGIFY( void main() { mediump vec3 yuv; lowp vec3 rgb; - yuv.r = texture2D(video_frame_y, sample_coordinate).r; + yuv.r = texture(video_frame_y, sample_coordinate).r; // Subtract (0.5, 0.5) because conversion is done assuming UV color // midpoint of (128, 128). - yuv.gb = texture2D(video_frame_uv, sample_coordinate).rg - vec2(0.5, 0.5); + yuv.gb = texture(video_frame_uv, sample_coordinate).rg - vec2(0.5, 0.5); // Using BT.709 which is the standard for HDTV. rgb = mat3(1, 1, 1, 0, -0.18732, 1.8556, 1.57481, -0.46813, 0) * yuv; gl_FragColor = vec4(rgb, 1); diff --git a/mediapipe/graphs/face_detection/BUILD b/mediapipe/graphs/face_detection/BUILD index ac54089b22..40dbc63eda 100644 --- a/mediapipe/graphs/face_detection/BUILD +++ b/mediapipe/graphs/face_detection/BUILD @@ -20,35 +20,32 @@ cc_library( name = "mobile_calculators", deps = [ "//mediapipe/calculators/core:flow_limiter_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", "//mediapipe/calculators/util:annotation_overlay_calculator", - "//mediapipe/calculators/util:detection_label_id_to_text_calculator", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", "//mediapipe/calculators/util:detections_to_render_data_calculator", - "//mediapipe/calculators/util:non_max_suppression_calculator", "//mediapipe/gpu:gpu_buffer_to_image_frame_calculator", "//mediapipe/gpu:image_frame_to_gpu_buffer_calculator", + "//mediapipe/modules/face_detection:face_detection_front_cpu", + "//mediapipe/modules/face_detection:face_detection_front_gpu", ], ) cc_library( - name = "desktop_tflite_calculators", + name = "desktop_live_calculators", deps = [ "//mediapipe/calculators/core:flow_limiter_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", 
"//mediapipe/calculators/util:annotation_overlay_calculator", - "//mediapipe/calculators/util:detection_label_id_to_text_calculator", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", "//mediapipe/calculators/util:detections_to_render_data_calculator", - "//mediapipe/calculators/util:non_max_suppression_calculator", + "//mediapipe/modules/face_detection:face_detection_front_cpu", + ], +) + +cc_library( + name = "desktop_live_gpu_calculators", + deps = [ + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:detections_to_render_data_calculator", + "//mediapipe/modules/face_detection:face_detection_front_gpu", ], ) @@ -58,15 +55,15 @@ load( ) mediapipe_binary_graph( - name = "mobile_cpu_binary_graph", + name = "face_detection_mobile_cpu_binary_graph", graph = "face_detection_mobile_cpu.pbtxt", - output_name = "mobile_cpu.binarypb", + output_name = "face_detection_mobile_cpu.binarypb", deps = [":mobile_calculators"], ) mediapipe_binary_graph( - name = "mobile_gpu_binary_graph", + name = "face_detection_mobile_gpu_binary_graph", graph = "face_detection_mobile_gpu.pbtxt", - output_name = "mobile_gpu.binarypb", + output_name = "face_detection_mobile_gpu.binarypb", deps = [":mobile_calculators"], ) diff --git a/mediapipe/graphs/face_detection/face_detection_desktop_live.pbtxt b/mediapipe/graphs/face_detection/face_detection_desktop_live.pbtxt index dd29961392..06478a7aa9 100644 --- a/mediapipe/graphs/face_detection/face_detection_desktop_live.pbtxt +++ b/mediapipe/graphs/face_detection/face_detection_desktop_live.pbtxt @@ -1,28 +1,27 @@ -# MediaPipe graph that performs face detection with TensorFlow Lite on CPU. -# Used in the examples in -# mediapipe/examples/desktop/face_detection:face_detection_cpu. +# MediaPipe graph that performs face mesh with TensorFlow Lite on CPU. -# Images on GPU coming into and out of the graph. +# CPU buffer. (ImageFrame) input_stream: "input_video" + +# Output image with rendered results. (ImageFrame) output_stream: "output_video" +# Detected faces. (std::vector) +output_stream: "face_detections" # Throttles the images flowing downstream for flow control. It passes through -# the very first incoming image unaltered, and waits for -# TfLiteTensorsToDetectionsCalculator downstream in the graph to finish -# generating the corresponding detections before it passes through another -# image. All images that come in while waiting are dropped, limiting the number -# of in-flight images between this calculator and -# TfLiteTensorsToDetectionsCalculator to 1. This prevents the nodes in between -# from queuing up incoming images and data excessively, which leads to increased -# latency and memory usage, unwanted in real-time mobile applications. It also -# eliminates unnecessarily computation, e.g., a transformed image produced by -# ImageTransformationCalculator may get dropped downstream if the subsequent -# TfLiteConverterCalculator or TfLiteInferenceCalculator is still busy -# processing previous inputs. +# the very first incoming image unaltered, and waits for downstream nodes +# (calculators and subgraphs) in the graph to finish their tasks before it +# passes through another image. All images that come in while waiting are +# dropped, limiting the number of in-flight images in most part of the graph to +# 1. 
This prevents the downstream nodes from queuing up incoming images and data +# excessively, which leads to increased latency and memory usage, unwanted in +# real-time mobile applications. It also eliminates unnecessarily computation, +# e.g., the output produced by a node may get dropped downstream if the +# subsequent nodes are still busy processing previous inputs. node { calculator: "FlowLimiterCalculator" input_stream: "input_video" - input_stream: "FINISHED:detections" + input_stream: "FINISHED:output_video" input_stream_info: { tag_index: "FINISHED" back_edge: true @@ -30,141 +29,17 @@ node { output_stream: "throttled_input_video" } -# Transforms the input image on CPU to a 128x128 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE:throttled_input_video" - output_stream: "IMAGE:transformed_input_video_cpu" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 128 - output_height: 128 - scale_mode: FIT - } - } -} - -# Converts the transformed input image on CPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_input_video_cpu" - output_stream: "TENSORS:image_tensor" -} - -# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS:image_tensor" - output_stream: "TENSORS:detection_tensors" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/face_detection_front.tflite" - } - } -} - -# Generates a single side packet containing a vector of SSD anchors based on -# the specification in the options. -node { - calculator: "SsdAnchorsCalculator" - output_side_packet: "anchors" - node_options: { - [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { - num_layers: 4 - min_scale: 0.1484375 - max_scale: 0.75 - input_size_height: 128 - input_size_width: 128 - anchor_offset_x: 0.5 - anchor_offset_y: 0.5 - strides: 8 - strides: 16 - strides: 16 - strides: 16 - aspect_ratios: 1.0 - fixed_anchor_size: true - } - } -} - -# Decodes the detection tensors generated by the TensorFlow Lite model, based on -# the SSD anchors and the specification in the options, into a vector of -# detections. Each detection describes a detected object. +# Subgraph that detects faces. node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS:detection_tensors" - input_side_packet: "ANCHORS:anchors" - output_stream: "DETECTIONS:detections" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { - num_classes: 1 - num_boxes: 896 - num_coords: 16 - box_coord_offset: 0 - keypoint_coord_offset: 4 - num_keypoints: 6 - num_values_per_keypoint: 2 - sigmoid_score: true - score_clipping_thresh: 100.0 - reverse_output_order: true - x_scale: 128.0 - y_scale: 128.0 - h_scale: 128.0 - w_scale: 128.0 - min_score_thresh: 0.5 - } - } -} - -# Performs non-max suppression to remove excessive detections. 
-node { - calculator: "NonMaxSuppressionCalculator" - input_stream: "detections" - output_stream: "filtered_detections" - node_options: { - [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { - min_suppression_threshold: 0.3 - overlap_type: INTERSECTION_OVER_UNION - algorithm: WEIGHTED - return_empty_detections: true - } - } -} - -# Maps detection label IDs to the corresponding label text ("Face"). The label -# map is provided in the label_map_path option. -node { - calculator: "DetectionLabelIdToTextCalculator" - input_stream: "filtered_detections" - output_stream: "labeled_detections" - node_options: { - [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { - label_map_path: "mediapipe/models/face_detection_front_labelmap.txt" - } - } -} - -# Adjusts detection locations (already normalized to [0.f, 1.f]) on the -# letterboxed image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (the -# input image to the graph before image transformation). -node { - calculator: "DetectionLetterboxRemovalCalculator" - input_stream: "DETECTIONS:labeled_detections" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "DETECTIONS:output_detections" + calculator: "FaceDetectionFrontCpu" + input_stream: "IMAGE:throttled_input_video" + output_stream: "DETECTIONS:face_detections" } # Converts the detections to drawing primitives for annotation overlay. node { calculator: "DetectionsToRenderDataCalculator" - input_stream: "DETECTIONS:output_detections" + input_stream: "DETECTIONS:face_detections" output_stream: "RENDER_DATA:render_data" node_options: { [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { @@ -181,4 +56,3 @@ node { input_stream: "render_data" output_stream: "IMAGE:output_video" } - diff --git a/mediapipe/graphs/face_detection/face_detection_mobile_cpu.pbtxt b/mediapipe/graphs/face_detection/face_detection_mobile_cpu.pbtxt index f3ae28b0d4..ac9c667a88 100644 --- a/mediapipe/graphs/face_detection/face_detection_mobile_cpu.pbtxt +++ b/mediapipe/graphs/face_detection/face_detection_mobile_cpu.pbtxt @@ -1,29 +1,27 @@ -# MediaPipe graph that performs face detection with TensorFlow Lite on CPU. -# Used in the examples in -# mediapipie/examples/android/src/java/com/mediapipe/apps/facedetectioncpu and -# mediapipie/examples/ios/facedetectioncpu. +# MediaPipe graph that performs face mesh with TensorFlow Lite on CPU. -# Images on GPU coming into and out of the graph. +# GPU buffer. (GpuBuffer) input_stream: "input_video" + +# Output image with rendered results. (GpuBuffer) output_stream: "output_video" +# Detected faces. (std::vector) +output_stream: "face_detections" # Throttles the images flowing downstream for flow control. It passes through -# the very first incoming image unaltered, and waits for -# TfLiteTensorsToDetectionsCalculator downstream in the graph to finish -# generating the corresponding detections before it passes through another -# image. All images that come in while waiting are dropped, limiting the number -# of in-flight images between this calculator and -# TfLiteTensorsToDetectionsCalculator to 1. This prevents the nodes in between -# from queuing up incoming images and data excessively, which leads to increased -# latency and memory usage, unwanted in real-time mobile applications. 
It also -# eliminates unnecessarily computation, e.g., a transformed image produced by -# ImageTransformationCalculator may get dropped downstream if the subsequent -# TfLiteConverterCalculator or TfLiteInferenceCalculator is still busy -# processing previous inputs. +# the very first incoming image unaltered, and waits for downstream nodes +# (calculators and subgraphs) in the graph to finish their tasks before it +# passes through another image. All images that come in while waiting are +# dropped, limiting the number of in-flight images in most part of the graph to +# 1. This prevents the downstream nodes from queuing up incoming images and data +# excessively, which leads to increased latency and memory usage, unwanted in +# real-time mobile applications. It also eliminates unnecessarily computation, +# e.g., the output produced by a node may get dropped downstream if the +# subsequent nodes are still busy processing previous inputs. node { calculator: "FlowLimiterCalculator" input_stream: "input_video" - input_stream: "FINISHED:detections" + input_stream: "FINISHED:output_video" input_stream_info: { tag_index: "FINISHED" back_edge: true @@ -41,141 +39,17 @@ node: { output_stream: "input_video_cpu" } -# Transforms the input image on CPU to a 128x128 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE:input_video_cpu" - output_stream: "IMAGE:transformed_input_video_cpu" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 128 - output_height: 128 - scale_mode: FIT - } - } -} - -# Converts the transformed input image on CPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_input_video_cpu" - output_stream: "TENSORS:image_tensor" -} - -# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS:image_tensor" - output_stream: "TENSORS:detection_tensors" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/face_detection_front.tflite" - } - } -} - -# Generates a single side packet containing a vector of SSD anchors based on -# the specification in the options. -node { - calculator: "SsdAnchorsCalculator" - output_side_packet: "anchors" - node_options: { - [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { - num_layers: 4 - min_scale: 0.1484375 - max_scale: 0.75 - input_size_height: 128 - input_size_width: 128 - anchor_offset_x: 0.5 - anchor_offset_y: 0.5 - strides: 8 - strides: 16 - strides: 16 - strides: 16 - aspect_ratios: 1.0 - fixed_anchor_size: true - } - } -} - -# Decodes the detection tensors generated by the TensorFlow Lite model, based on -# the SSD anchors and the specification in the options, into a vector of -# detections. Each detection describes a detected object. +# Subgraph that detects faces. 
node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS:detection_tensors" - input_side_packet: "ANCHORS:anchors" - output_stream: "DETECTIONS:detections" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { - num_classes: 1 - num_boxes: 896 - num_coords: 16 - box_coord_offset: 0 - keypoint_coord_offset: 4 - num_keypoints: 6 - num_values_per_keypoint: 2 - sigmoid_score: true - score_clipping_thresh: 100.0 - reverse_output_order: true - x_scale: 128.0 - y_scale: 128.0 - h_scale: 128.0 - w_scale: 128.0 - min_score_thresh: 0.5 - } - } -} - -# Performs non-max suppression to remove excessive detections. -node { - calculator: "NonMaxSuppressionCalculator" - input_stream: "detections" - output_stream: "filtered_detections" - node_options: { - [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { - min_suppression_threshold: 0.3 - overlap_type: INTERSECTION_OVER_UNION - algorithm: WEIGHTED - return_empty_detections: true - } - } -} - -# Maps detection label IDs to the corresponding label text ("Face"). The label -# map is provided in the label_map_path option. -node { - calculator: "DetectionLabelIdToTextCalculator" - input_stream: "filtered_detections" - output_stream: "labeled_detections" - node_options: { - [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { - label_map_path: "mediapipe/models/face_detection_front_labelmap.txt" - } - } -} - -# Adjusts detection locations (already normalized to [0.f, 1.f]) on the -# letterboxed image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (the -# input image to the graph before image transformation). -node { - calculator: "DetectionLetterboxRemovalCalculator" - input_stream: "DETECTIONS:labeled_detections" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "DETECTIONS:output_detections" + calculator: "FaceDetectionFrontCpu" + input_stream: "IMAGE:input_video_cpu" + output_stream: "DETECTIONS:face_detections" } # Converts the detections to drawing primitives for annotation overlay. node { calculator: "DetectionsToRenderDataCalculator" - input_stream: "DETECTIONS:output_detections" + input_stream: "DETECTIONS:face_detections" output_stream: "RENDER_DATA:render_data" node_options: { [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { diff --git a/mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt b/mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt index 8c79a6ce33..904246a59f 100644 --- a/mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt +++ b/mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt @@ -1,29 +1,27 @@ -# MediaPipe graph that performs face detection with TensorFlow Lite on GPU. -# Used in the examples in -# mediapipie/examples/android/src/java/com/mediapipe/apps/facedetectiongpu and -# mediapipie/examples/ios/facedetectiongpu. +# MediaPipe graph that performs face mesh with TensorFlow Lite on GPU. -# Images on GPU coming into and out of the graph. +# GPU buffer. (GpuBuffer) input_stream: "input_video" + +# Output image with rendered results. (GpuBuffer) output_stream: "output_video" +# Detected faces. (std::vector) +output_stream: "face_detections" # Throttles the images flowing downstream for flow control. 
It passes through -# the very first incoming image unaltered, and waits for -# TfLiteTensorsToDetectionsCalculator downstream in the graph to finish -# generating the corresponding detections before it passes through another -# image. All images that come in while waiting are dropped, limiting the number -# of in-flight images between this calculator and -# TfLiteTensorsToDetectionsCalculator to 1. This prevents the nodes in between -# from queuing up incoming images and data excessively, which leads to increased -# latency and memory usage, unwanted in real-time mobile applications. It also -# eliminates unnecessarily computation, e.g., a transformed image produced by -# ImageTransformationCalculator may get dropped downstream if the subsequent -# TfLiteConverterCalculator or TfLiteInferenceCalculator is still busy -# processing previous inputs. +# the very first incoming image unaltered, and waits for downstream nodes +# (calculators and subgraphs) in the graph to finish their tasks before it +# passes through another image. All images that come in while waiting are +# dropped, limiting the number of in-flight images in most part of the graph to +# 1. This prevents the downstream nodes from queuing up incoming images and data +# excessively, which leads to increased latency and memory usage, unwanted in +# real-time mobile applications. It also eliminates unnecessarily computation, +# e.g., the output produced by a node may get dropped downstream if the +# subsequent nodes are still busy processing previous inputs. node { calculator: "FlowLimiterCalculator" input_stream: "input_video" - input_stream: "FINISHED:detections" + input_stream: "FINISHED:output_video" input_stream_info: { tag_index: "FINISHED" back_edge: true @@ -31,141 +29,17 @@ node { output_stream: "throttled_input_video" } -# Transforms the input image on GPU to a 128x128 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE_GPU:throttled_input_video" - output_stream: "IMAGE_GPU:transformed_input_video" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 128 - output_height: 128 - scale_mode: FIT - } - } -} - -# Converts the transformed input image on GPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_input_video" - output_stream: "TENSORS_GPU:image_tensor" -} - -# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:image_tensor" - output_stream: "TENSORS_GPU:detection_tensors" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/face_detection_front.tflite" - } - } -} - -# Generates a single side packet containing a vector of SSD anchors based on -# the specification in the options. 
-node { - calculator: "SsdAnchorsCalculator" - output_side_packet: "anchors" - node_options: { - [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { - num_layers: 4 - min_scale: 0.1484375 - max_scale: 0.75 - input_size_height: 128 - input_size_width: 128 - anchor_offset_x: 0.5 - anchor_offset_y: 0.5 - strides: 8 - strides: 16 - strides: 16 - strides: 16 - aspect_ratios: 1.0 - fixed_anchor_size: true - } - } -} - -# Decodes the detection tensors generated by the TensorFlow Lite model, based on -# the SSD anchors and the specification in the options, into a vector of -# detections. Each detection describes a detected object. -node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS_GPU:detection_tensors" - input_side_packet: "ANCHORS:anchors" - output_stream: "DETECTIONS:detections" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { - num_classes: 1 - num_boxes: 896 - num_coords: 16 - box_coord_offset: 0 - keypoint_coord_offset: 4 - num_keypoints: 6 - num_values_per_keypoint: 2 - sigmoid_score: true - score_clipping_thresh: 100.0 - reverse_output_order: true - x_scale: 128.0 - y_scale: 128.0 - h_scale: 128.0 - w_scale: 128.0 - min_score_thresh: 0.5 - } - } -} - -# Performs non-max suppression to remove excessive detections. -node { - calculator: "NonMaxSuppressionCalculator" - input_stream: "detections" - output_stream: "filtered_detections" - node_options: { - [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { - min_suppression_threshold: 0.3 - overlap_type: INTERSECTION_OVER_UNION - algorithm: WEIGHTED - return_empty_detections: true - } - } -} - -# Maps detection label IDs to the corresponding label text ("Face"). The label -# map is provided in the label_map_path option. -node { - calculator: "DetectionLabelIdToTextCalculator" - input_stream: "filtered_detections" - output_stream: "labeled_detections" - node_options: { - [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { - label_map_path: "mediapipe/models/face_detection_front_labelmap.txt" - } - } -} - -# Adjusts detection locations (already normalized to [0.f, 1.f]) on the -# letterboxed image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (the -# input image to the graph before image transformation). +# Subgraph that detects faces. node { - calculator: "DetectionLetterboxRemovalCalculator" - input_stream: "DETECTIONS:labeled_detections" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "DETECTIONS:output_detections" + calculator: "FaceDetectionFrontGpu" + input_stream: "IMAGE:throttled_input_video" + output_stream: "DETECTIONS:face_detections" } # Converts the detections to drawing primitives for annotation overlay. 
node { calculator: "DetectionsToRenderDataCalculator" - input_stream: "DETECTIONS:output_detections" + input_stream: "DETECTIONS:face_detections" output_stream: "RENDER_DATA:render_data" node_options: { [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { diff --git a/mediapipe/graphs/hand_tracking/BUILD b/mediapipe/graphs/hand_tracking/BUILD index 0705905a12..ed922ba8ef 100644 --- a/mediapipe/graphs/hand_tracking/BUILD +++ b/mediapipe/graphs/hand_tracking/BUILD @@ -42,10 +42,11 @@ cc_library( name = "desktop_tflite_calculators", deps = [ ":desktop_offline_calculators", + "//mediapipe/calculators/core:constant_side_packet_calculator", "//mediapipe/calculators/core:merge_calculator", - "//mediapipe/graphs/hand_tracking/subgraphs:hand_detection_cpu", - "//mediapipe/graphs/hand_tracking/subgraphs:hand_landmark_cpu", - "//mediapipe/graphs/hand_tracking/subgraphs:renderer_cpu", + "//mediapipe/graphs/hand_tracking/subgraphs:hand_renderer_cpu", + "//mediapipe/modules/hand_landmark:hand_landmark_tracking_cpu", + "//mediapipe/modules/palm_detection:palm_detection_cpu", ], ) @@ -59,13 +60,10 @@ mediapipe_binary_graph( cc_library( name = "mobile_calculators", deps = [ + "//mediapipe/calculators/core:constant_side_packet_calculator", "//mediapipe/calculators/core:flow_limiter_calculator", - "//mediapipe/calculators/core:gate_calculator", - "//mediapipe/calculators/core:merge_calculator", - "//mediapipe/calculators/core:previous_loopback_calculator", - "//mediapipe/graphs/hand_tracking/subgraphs:hand_detection_gpu", - "//mediapipe/graphs/hand_tracking/subgraphs:hand_landmark_gpu", - "//mediapipe/graphs/hand_tracking/subgraphs:renderer_gpu", + "//mediapipe/graphs/hand_tracking/subgraphs:hand_renderer_gpu", + "//mediapipe/modules/hand_landmark:hand_landmark_tracking_gpu", ], ) @@ -76,52 +74,13 @@ mediapipe_binary_graph( deps = [":mobile_calculators"], ) -cc_library( - name = "multi_hand_desktop_tflite_calculators", - deps = [ - ":desktop_offline_calculators", - "//mediapipe/calculators/util:association_norm_rect_calculator", - "//mediapipe/calculators/util:collection_has_min_size_calculator", - "//mediapipe/graphs/hand_tracking/subgraphs:multi_hand_detection_cpu", - "//mediapipe/graphs/hand_tracking/subgraphs:multi_hand_landmark_cpu", - "//mediapipe/graphs/hand_tracking/subgraphs:multi_hand_renderer_cpu", - ], -) - -cc_library( - name = "multi_hand_mobile_calculators", - deps = [ - "//mediapipe/calculators/core:flow_limiter_calculator", - "//mediapipe/calculators/core:gate_calculator", - "//mediapipe/calculators/core:previous_loopback_calculator", - "//mediapipe/calculators/util:association_norm_rect_calculator", - "//mediapipe/calculators/util:collection_has_min_size_calculator", - "//mediapipe/graphs/hand_tracking/subgraphs:multi_hand_detection_gpu", - "//mediapipe/graphs/hand_tracking/subgraphs:multi_hand_landmark_gpu", - "//mediapipe/graphs/hand_tracking/subgraphs:multi_hand_renderer_gpu", - ], -) - -mediapipe_binary_graph( - name = "multi_hand_tracking_desktop_live_binary_graph", - graph = "multi_hand_tracking_desktop_live.pbtxt", - output_name = "multi_hand_tracking_desktop_live.binarypb", - deps = [":multi_hand_desktop_tflite_calculators"], -) - -mediapipe_binary_graph( - name = "multi_hand_tracking_mobile_gpu_binary_graph", - graph = "multi_hand_tracking_mobile.pbtxt", - output_name = "multi_hand_tracking_mobile_gpu.binarypb", - deps = [":multi_hand_mobile_calculators"], -) - cc_library( name = "detection_mobile_calculators", deps = [ 
"//mediapipe/calculators/core:flow_limiter_calculator", - "//mediapipe/graphs/hand_tracking/subgraphs:hand_detection_gpu", - "//mediapipe/graphs/hand_tracking/subgraphs:renderer_gpu", + "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:detections_to_render_data_calculator", + "//mediapipe/modules/palm_detection:palm_detection_gpu", ], ) diff --git a/mediapipe/graphs/hand_tracking/calculators/BUILD b/mediapipe/graphs/hand_tracking/calculators/BUILD index b2a8efe370..3d15861316 100644 --- a/mediapipe/graphs/hand_tracking/calculators/BUILD +++ b/mediapipe/graphs/hand_tracking/calculators/BUILD @@ -15,19 +15,3 @@ licenses(["notice"]) package(default_visibility = ["//visibility:public"]) - -cc_library( - name = "hand_landmarks_to_rect_calculator", - srcs = ["hand_landmarks_to_rect_calculator.cc"], - visibility = ["//visibility:public"], - deps = [ - "//mediapipe/framework:calculator_framework", - "//mediapipe/framework:calculator_options_cc_proto", - "//mediapipe/framework/formats:landmark_cc_proto", - "//mediapipe/framework/formats:location_data_cc_proto", - "//mediapipe/framework/formats:rect_cc_proto", - "//mediapipe/framework/port:ret_check", - "//mediapipe/framework/port:status", - ], - alwayslink = 1, -) diff --git a/mediapipe/graphs/hand_tracking/hand_detection_desktop.pbtxt b/mediapipe/graphs/hand_tracking/hand_detection_desktop.pbtxt index 813f555bf4..3edcfe7d43 100644 --- a/mediapipe/graphs/hand_tracking/hand_detection_desktop.pbtxt +++ b/mediapipe/graphs/hand_tracking/hand_detection_desktop.pbtxt @@ -16,11 +16,10 @@ node { output_stream: "VIDEO_PRESTREAM:input_video_header" } -# Performs hand detection model on the input frames. See -# hand_detection_cpu.pbtxt for the detail of the sub-graph. +# Detects palms. node { - calculator: "HandDetectionSubgraph" - input_stream: "input_video" + calculator: "PalmDetectionCpu" + input_stream: "IMAGE:input_video" output_stream: "DETECTIONS:output_detections" } diff --git a/mediapipe/graphs/hand_tracking/hand_detection_desktop_live.pbtxt b/mediapipe/graphs/hand_tracking/hand_detection_desktop_live.pbtxt index 26f8d1b461..1bbd8bc5ce 100644 --- a/mediapipe/graphs/hand_tracking/hand_detection_desktop_live.pbtxt +++ b/mediapipe/graphs/hand_tracking/hand_detection_desktop_live.pbtxt @@ -3,15 +3,16 @@ # Used in the example in # mediapipe/examples/desktop/hand_tracking:hand_detection_cpu. -# Images coming into and out of the graph. +# CPU image. (ImageFrame) input_stream: "input_video" + +# CPU image. (ImageFrame) output_stream: "output_video" -# Performs hand detection model on the input frames. See -# hand_detection_cpu.pbtxt for the detail of the sub-graph. +# Detects palms. node { - calculator: "HandDetectionSubgraph" - input_stream: "input_video" + calculator: "PalmDetectionCpu" + input_stream: "IMAGE:input_video" output_stream: "DETECTIONS:output_detections" } diff --git a/mediapipe/graphs/hand_tracking/hand_detection_mobile.pbtxt b/mediapipe/graphs/hand_tracking/hand_detection_mobile.pbtxt index df8ca6dbff..74ff5c580d 100644 --- a/mediapipe/graphs/hand_tracking/hand_detection_mobile.pbtxt +++ b/mediapipe/graphs/hand_tracking/hand_detection_mobile.pbtxt @@ -3,24 +3,26 @@ # mediapipe/examples/android/src/java/com/mediapipe/apps/handdetectiongpu and # mediapipe/examples/ios/handdetectiongpu. -# Images coming into and out of the graph. +# GPU image. (GpuBuffer) input_stream: "input_video" + +# GPU image. (GpuBuffer) output_stream: "output_video" # Throttles the images flowing downstream for flow control. 
It passes through -# the very first incoming image unaltered, and waits for HandDetectionSubgraph +# the very first incoming image unaltered, and waits for PalmDetectionGpu # downstream in the graph to finish its tasks before it passes through another # image. All images that come in while waiting are dropped, limiting the number -# of in-flight images in HandDetectionSubgraph to 1. This prevents the nodes in -# HandDetectionSubgraph from queuing up incoming images and data excessively, -# which leads to increased latency and memory usage, unwanted in real-time -# mobile applications. It also eliminates unnecessarily computation, e.g., the -# output produced by a node in the subgraph may get dropped downstream if the +# of in-flight images in PalmDetectionGpu to 1. This prevents the nodes in +# PalmDetectionGpu from queuing up incoming images and data excessively, which +# leads to increased latency and memory usage, unwanted in real-time mobile +# applications. It also eliminates unnecessarily computation, e.g., the output +# produced by a node in the subgraph may get dropped downstream if the # subsequent nodes are still busy processing previous inputs. node { calculator: "FlowLimiterCalculator" input_stream: "input_video" - input_stream: "FINISHED:hand_rect_from_palm_detections" + input_stream: "FINISHED:output_video" input_stream_info: { tag_index: "FINISHED" back_edge: true @@ -28,12 +30,11 @@ node { output_stream: "throttled_input_video" } -# Subgraph that detections hands (see hand_detection_gpu.pbtxt). +# Detects palms. node { - calculator: "HandDetectionSubgraph" - input_stream: "throttled_input_video" + calculator: "PalmDetectionGpu" + input_stream: "IMAGE:throttled_input_video" output_stream: "DETECTIONS:palm_detections" - output_stream: "NORM_RECT:hand_rect_from_palm_detections" } # Converts detections to drawing primitives for annotation overlay. @@ -49,25 +50,10 @@ node { } } -# Converts normalized rects to drawing primitives for annotation overlay. -node { - calculator: "RectToRenderDataCalculator" - input_stream: "NORM_RECT:hand_rect_from_palm_detections" - output_stream: "RENDER_DATA:rect_render_data" - node_options: { - [type.googleapis.com/mediapipe.RectToRenderDataCalculatorOptions] { - filled: false - color { r: 255 g: 0 b: 0 } - thickness: 4.0 - } - } -} - # Draws annotations and overlays them on top of the input images. node { calculator: "AnnotationOverlayCalculator" input_stream: "IMAGE_GPU:throttled_input_video" input_stream: "detection_render_data" - input_stream: "rect_render_data" output_stream: "IMAGE_GPU:output_video" } diff --git a/mediapipe/graphs/hand_tracking/hand_tracking_desktop.pbtxt b/mediapipe/graphs/hand_tracking/hand_tracking_desktop.pbtxt index 4e00750f5c..bc6e81cdcc 100644 --- a/mediapipe/graphs/hand_tracking/hand_tracking_desktop.pbtxt +++ b/mediapipe/graphs/hand_tracking/hand_tracking_desktop.pbtxt @@ -1,4 +1,4 @@ -# MediaPipe graph that performs hand tracking on desktop with TensorFlow Lite +# MediaPipe graph that performs hands tracking on desktop with TensorFlow Lite # on CPU. # Used in the example in # mediapipe/examples/desktop/hand_tracking:hand_tracking_tflite. @@ -16,99 +16,39 @@ node { output_stream: "VIDEO_PRESTREAM:input_video_header" } -# Caches a hand-presence decision fed back from HandLandmarkSubgraph, and upon -# the arrival of the next input image sends out the cached decision with the -# timestamp replaced by that of the input image, essentially generating a packet -# that carries the previous hand-presence decision. 
Note that upon the arrival
-# of the very first input image, an empty packet is sent out to jump start the
-# feedback loop.
+# Generates side packet containing max number of hands to detect/track.
 node {
-  calculator: "PreviousLoopbackCalculator"
-  input_stream: "MAIN:input_video"
-  input_stream: "LOOP:hand_presence"
-  input_stream_info: {
-    tag_index: "LOOP"
-    back_edge: true
-  }
-  output_stream: "PREV_LOOP:prev_hand_presence"
-}
-
-# Drops the incoming image if HandLandmarkSubgraph was able to identify hand
-# presence in the previous image. Otherwise, passes the incoming image through
-# to trigger a new round of hand detection in HandDetectionSubgraph.
-node {
-  calculator: "GateCalculator"
-  input_stream: "input_video"
-  input_stream: "DISALLOW:prev_hand_presence"
-  output_stream: "hand_detection_input_video"
-
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:num_hands"
   node_options: {
-    [type.googleapis.com/mediapipe.GateCalculatorOptions] {
-      empty_packets_as_allow: true
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { int_value: 2 }
     }
   }
 }
 
-# Subgraph that detections hands (see hand_detection_cpu.pbtxt).
-node {
-  calculator: "HandDetectionSubgraph"
-  input_stream: "hand_detection_input_video"
-  output_stream: "DETECTIONS:palm_detections"
-  output_stream: "NORM_RECT:hand_rect_from_palm_detections"
-}
-
-# Subgraph that localizes hand landmarks (see hand_landmark_cpu.pbtxt).
+# Detects/tracks hand landmarks.
 node {
-  calculator: "HandLandmarkSubgraph"
+  calculator: "HandLandmarkTrackingCpu"
   input_stream: "IMAGE:input_video"
-  input_stream: "NORM_RECT:hand_rect"
-  output_stream: "LANDMARKS:hand_landmarks"
-  output_stream: "NORM_RECT:hand_rect_from_landmarks"
+  input_side_packet: "NUM_HANDS:num_hands"
+  output_stream: "LANDMARKS:landmarks"
   output_stream: "HANDEDNESS:handedness"
-  output_stream: "PRESENCE:hand_presence"
-}
-
-# Caches a hand rectangle fed back from HandLandmarkSubgraph, and upon the
-# arrival of the next input image sends out the cached rectangle with the
-# timestamp replaced by that of the input image, essentially generating a packet
-# that carries the previous hand rectangle. Note that upon the arrival of the
-# very first input image, an empty packet is sent out to jump start the
-# feedback loop.
-node {
-  calculator: "PreviousLoopbackCalculator"
-  input_stream: "MAIN:input_video"
-  input_stream: "LOOP:hand_rect_from_landmarks"
-  input_stream_info: {
-    tag_index: "LOOP"
-    back_edge: true
-  }
-  output_stream: "PREV_LOOP:prev_hand_rect_from_landmarks"
-}
-
-# Merges a stream of hand rectangles generated by HandDetectionSubgraph and that
-# generated by HandLandmarkSubgraph into a single output stream by selecting
-# between one of the two streams. The former is selected if the incoming packet
-# is not empty, i.e., hand detection is performed on the current image by
-# HandDetectionSubgraph (because HandLandmarkSubgraph could not identify hand
-# presence in the previous image). Otherwise, the latter is selected, which is
-# never empty because HandLandmarkSubgraphs processes all images (that went
-# through FlowLimiterCaculator).
-node {
-  calculator: "MergeCalculator"
-  input_stream: "hand_rect_from_palm_detections"
-  input_stream: "prev_hand_rect_from_landmarks"
-  output_stream: "hand_rect"
+  output_stream: "PALM_DETECTIONS:multi_palm_detections"
+  output_stream: "HAND_ROIS_FROM_LANDMARKS:multi_hand_rects"
+  output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:multi_palm_rects"
 }
 
 # Subgraph that renders annotations and overlays them on top of the input
-# images (see renderer_cpu.pbtxt).
+# images (see hand_renderer_cpu.pbtxt).
 node {
-  calculator: "RendererSubgraph"
+  calculator: "HandRendererSubgraph"
   input_stream: "IMAGE:input_video"
-  input_stream: "LANDMARKS:hand_landmarks"
-  input_stream: "NORM_RECT:hand_rect"
-  input_stream: "DETECTIONS:palm_detections"
+  input_stream: "DETECTIONS:multi_palm_detections"
+  input_stream: "LANDMARKS:landmarks"
   input_stream: "HANDEDNESS:handedness"
+  input_stream: "NORM_RECTS:0:multi_palm_rects"
+  input_stream: "NORM_RECTS:1:multi_hand_rects"
   output_stream: "IMAGE:output_video"
 }
diff --git a/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt b/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt
index 3106e90414..20de18f35a 100644
--- a/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt
+++ b/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt
@@ -1,108 +1,46 @@
-# MediaPipe graph that performs hand tracking on desktop with TensorFlow Lite
-# on CPU.
+# MediaPipe graph that performs hand tracking on desktop with TensorFlow
+# Lite on CPU.
 # Used in the example in
-# mediapipie/examples/desktop/hand_tracking:hand_tracking_cpu.
+# mediapipe/examples/desktop/hand_tracking:hand_tracking_cpu.
 
-# Images coming into and out of the graph.
+# CPU image. (ImageFrame)
 input_stream: "input_video"
-output_stream: "output_video"
-# Hand landmarks and palm detection info.
-output_stream: "palm_detections"
-output_stream: "hand_landmarks"
 
-# Caches a hand-presence decision fed back from HandLandmarkSubgraph, and upon
-# the arrival of the next input image sends out the cached decision with the
-# timestamp replaced by that of the input image, essentially generating a packet
-# that carries the previous hand-presence decision. Note that upon the arrival
-# of the very first input image, an empty packet is sent out to jump start the
-# feedback loop.
-node {
-  calculator: "PreviousLoopbackCalculator"
-  input_stream: "MAIN:input_video"
-  input_stream: "LOOP:hand_presence"
-  input_stream_info: {
-    tag_index: "LOOP"
-    back_edge: true
-  }
-  output_stream: "PREV_LOOP:prev_hand_presence"
-}
+# CPU image. (ImageFrame)
+output_stream: "output_video"
 
-# Drops the incoming image if HandLandmarkSubgraph was able to identify hand
-# presence in the previous image. Otherwise, passes the incoming image through
-# to trigger a new round of hand detection in HandDetectionSubgraph.
+# Generates side packet containing max number of hands to detect/track.
 node {
-  calculator: "GateCalculator"
-  input_stream: "input_video"
-  input_stream: "DISALLOW:prev_hand_presence"
-  output_stream: "hand_detection_input_video"
-
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:num_hands"
   node_options: {
-    [type.googleapis.com/mediapipe.GateCalculatorOptions] {
-      empty_packets_as_allow: true
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { int_value: 2 }
    }
  }
 }
 
-# Subgraph that detections hands (see hand_detection_cpu.pbtxt).
-node { - calculator: "HandDetectionSubgraph" - input_stream: "hand_detection_input_video" - output_stream: "DETECTIONS:palm_detections" - output_stream: "NORM_RECT:hand_rect_from_palm_detections" -} - -# Subgraph that localizes hand landmarks (see hand_landmark_cpu.pbtxt). +# Detects/tracks hand landmarks. node { - calculator: "HandLandmarkSubgraph" + calculator: "HandLandmarkTrackingCpu" input_stream: "IMAGE:input_video" - input_stream: "NORM_RECT:hand_rect" - output_stream: "LANDMARKS:hand_landmarks" - output_stream: "NORM_RECT:hand_rect_from_landmarks" + input_side_packet: "NUM_HANDS:num_hands" + output_stream: "LANDMARKS:landmarks" output_stream: "HANDEDNESS:handedness" - output_stream: "PRESENCE:hand_presence" -} - -# Caches a hand rectangle fed back from HandLandmarkSubgraph, and upon the -# arrival of the next input image sends out the cached rectangle with the -# timestamp replaced by that of the input image, essentially generating a packet -# that carries the previous hand rectangle. Note that upon the arrival of the -# very first input image, an empty packet is sent out to jump start the -# feedback loop. -node { - calculator: "PreviousLoopbackCalculator" - input_stream: "MAIN:input_video" - input_stream: "LOOP:hand_rect_from_landmarks" - input_stream_info: { - tag_index: "LOOP" - back_edge: true - } - output_stream: "PREV_LOOP:prev_hand_rect_from_landmarks" -} - -# Merges a stream of hand rectangles generated by HandDetectionSubgraph and that -# generated by HandLandmarkSubgraph into a single output stream by selecting -# between one of the two streams. The former is selected if the incoming packet -# is not empty, i.e., hand detection is performed on the current image by -# HandDetectionSubgraph (because HandLandmarkSubgraph could not identify hand -# presence in the previous image). Otherwise, the latter is selected, which is -# never empty because HandLandmarkSubgraphs processes all images (that went -# through FlowLimiterCaculator). -node { - calculator: "MergeCalculator" - input_stream: "hand_rect_from_palm_detections" - input_stream: "prev_hand_rect_from_landmarks" - output_stream: "hand_rect" + output_stream: "PALM_DETECTIONS:multi_palm_detections" + output_stream: "HAND_ROIS_FROM_LANDMARKS:multi_hand_rects" + output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:multi_palm_rects" } # Subgraph that renders annotations and overlays them on top of the input -# images (see renderer_cpu.pbtxt). +# images (see hand_renderer_cpu.pbtxt). node { - calculator: "RendererSubgraph" + calculator: "HandRendererSubgraph" input_stream: "IMAGE:input_video" - input_stream: "LANDMARKS:hand_landmarks" - input_stream: "NORM_RECT:hand_rect" - input_stream: "DETECTIONS:palm_detections" + input_stream: "DETECTIONS:multi_palm_detections" + input_stream: "LANDMARKS:landmarks" input_stream: "HANDEDNESS:handedness" + input_stream: "NORM_RECTS:0:multi_palm_rects" + input_stream: "NORM_RECTS:1:multi_hand_rects" output_stream: "IMAGE:output_video" } - diff --git a/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live_gpu.pbtxt b/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live_gpu.pbtxt new file mode 100644 index 0000000000..4dcaac5155 --- /dev/null +++ b/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live_gpu.pbtxt @@ -0,0 +1,48 @@ +# MediaPipe graph that performs multi-hand tracking with TensorFlow Lite on GPU. +# Used in the examples in +# mediapipe/examples/android/src/java/com/mediapipe/apps/handtrackinggpu. + +# GPU image. (GpuBuffer) +input_stream: "input_video" + +# GPU image. 
(GpuBuffer) +output_stream: "output_video" +# Collection of detected/predicted hands, each represented as a list of +# landmarks. (std::vector) +output_stream: "hand_landmarks" + +# Generates side packet cotaining max number of hands to detect/track. +node { + calculator: "ConstantSidePacketCalculator" + output_side_packet: "PACKET:num_hands" + node_options: { + [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: { + packet { int_value: 2 } + } + } +} + +# Detects/tracks hand landmarks. +node { + calculator: "HandLandmarkTrackingGpu" + input_stream: "IMAGE:input_video" + input_side_packet: "NUM_HANDS:num_hands" + output_stream: "LANDMARKS:hand_landmarks" + output_stream: "HANDEDNESS:handedness" + output_stream: "PALM_DETECTIONS:palm_detections" + output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects_from_landmarks" + output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections" +} + +# Subgraph that renders annotations and overlays them on top of the input +# images (see hand_renderer_gpu.pbtxt). +node { + calculator: "HandRendererSubgraph" + input_stream: "IMAGE:input_video" + input_stream: "DETECTIONS:palm_detections" + input_stream: "LANDMARKS:hand_landmarks" + input_stream: "HANDEDNESS:handedness" + input_stream: "NORM_RECTS:0:hand_rects_from_palm_detections" + input_stream: "NORM_RECTS:1:hand_rects_from_landmarks" + output_stream: "IMAGE:output_video" +} diff --git a/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt b/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt index 5b6f6d15cd..3672945f59 100644 --- a/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt +++ b/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt @@ -1,11 +1,18 @@ -# MediaPipe graph that performs hand tracking with TensorFlow Lite on GPU. +# MediaPipe graph that performs multi-hand tracking with TensorFlow Lite on GPU. # Used in the examples in -# mediapipe/examples/android/src/java/com/mediapipe/apps/handtrackinggpu and -# mediapipe/examples/ios/handtrackinggpu. +# mediapipe/examples/android/src/java/com/mediapipe/apps/handtrackinggpu. -# Images coming into and out of the graph. +# GPU image. (GpuBuffer) input_stream: "input_video" + +# Max number of hands to detect/process. (int) +input_side_packet: "num_hands" + +# GPU image. (GpuBuffer) output_stream: "output_video" +# Collection of detected/predicted hands, each represented as a list of +# landmarks. (std::vector) +output_stream: "hand_landmarks" # Throttles the images flowing downstream for flow control. It passes through # the very first incoming image unaltered, and waits for downstream nodes @@ -20,7 +27,7 @@ output_stream: "output_video" node { calculator: "FlowLimiterCalculator" input_stream: "input_video" - input_stream: "FINISHED:hand_rect" + input_stream: "FINISHED:output_video" input_stream_info: { tag_index: "FINISHED" back_edge: true @@ -28,98 +35,27 @@ node { output_stream: "throttled_input_video" } -# Caches a hand-presence decision fed back from HandLandmarkSubgraph, and upon -# the arrival of the next input image sends out the cached decision with the -# timestamp replaced by that of the input image, essentially generating a packet -# that carries the previous hand-presence decision. Note that upon the arrival -# of the very first input image, an empty packet is sent out to jump start the -# feedback loop. 
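
For reference: hand_tracking_mobile.pbtxt above now declares "num_hands" as a graph input side packet, so the hosting application has to supply that value when it starts the graph (the desktop graphs instead hard-code it via ConstantSidePacketCalculator). Below is a minimal host-side C++ sketch following the general pattern of MediaPipe's demo runners; the function name and the way the graph config text is obtained are illustrative assumptions, not part of this change.

#include <map>
#include <string>

#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/parse_text_proto.h"
#include "mediapipe/framework/port/status.h"

// Starts the hand tracking graph with an explicit "num_hands" side packet.
// `graph_config_text` is assumed to already hold the contents of the .pbtxt.
mediapipe::Status RunHandTrackingGraph(const std::string& graph_config_text) {
  auto config =
      mediapipe::ParseTextProtoOrDie<mediapipe::CalculatorGraphConfig>(
          graph_config_text);

  mediapipe::CalculatorGraph graph;
  MP_RETURN_IF_ERROR(graph.Initialize(config));

  // 2 mirrors the value the desktop graphs set via ConstantSidePacketCalculator.
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets["num_hands"] = mediapipe::MakePacket<int>(2);
  MP_RETURN_IF_ERROR(graph.StartRun(side_packets));

  // ... add packets to "input_video" and observe "hand_landmarks" here ...
  return graph.WaitUntilDone();
}
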
-node { - calculator: "PreviousLoopbackCalculator" - input_stream: "MAIN:throttled_input_video" - input_stream: "LOOP:hand_presence" - input_stream_info: { - tag_index: "LOOP" - back_edge: true - } - output_stream: "PREV_LOOP:prev_hand_presence" -} - -# Drops the incoming image if HandLandmarkSubgraph was able to identify hand -# presence in the previous image. Otherwise, passes the incoming image through -# to trigger a new round of hand detection in HandDetectionSubgraph. +# Detects/tracks hand landmarks. node { - calculator: "GateCalculator" - input_stream: "throttled_input_video" - input_stream: "DISALLOW:prev_hand_presence" - output_stream: "hand_detection_input_video" - - node_options: { - [type.googleapis.com/mediapipe.GateCalculatorOptions] { - empty_packets_as_allow: true - } - } -} - -# Subgraph that detections hands (see hand_detection_gpu.pbtxt). -node { - calculator: "HandDetectionSubgraph" - input_stream: "hand_detection_input_video" - output_stream: "DETECTIONS:palm_detections" - output_stream: "NORM_RECT:hand_rect_from_palm_detections" -} - -# Subgraph that localizes hand landmarks (see hand_landmark_gpu.pbtxt). -node { - calculator: "HandLandmarkSubgraph" + calculator: "HandLandmarkTrackingGpu" input_stream: "IMAGE:throttled_input_video" - input_stream: "NORM_RECT:hand_rect" + input_side_packet: "NUM_HANDS:num_hands" output_stream: "LANDMARKS:hand_landmarks" - output_stream: "NORM_RECT:hand_rect_from_landmarks" - output_stream: "PRESENCE:hand_presence" output_stream: "HANDEDNESS:handedness" -} - -# Caches a hand rectangle fed back from HandLandmarkSubgraph, and upon the -# arrival of the next input image sends out the cached rectangle with the -# timestamp replaced by that of the input image, essentially generating a packet -# that carries the previous hand rectangle. Note that upon the arrival of the -# very first input image, an empty packet is sent out to jump start the -# feedback loop. -node { - calculator: "PreviousLoopbackCalculator" - input_stream: "MAIN:throttled_input_video" - input_stream: "LOOP:hand_rect_from_landmarks" - input_stream_info: { - tag_index: "LOOP" - back_edge: true - } - output_stream: "PREV_LOOP:prev_hand_rect_from_landmarks" -} - -# Merges a stream of hand rectangles generated by HandDetectionSubgraph and that -# generated by HandLandmarkSubgraph into a single output stream by selecting -# between one of the two streams. The former is selected if the incoming packet -# is not empty, i.e., hand detection is performed on the current image by -# HandDetectionSubgraph (because HandLandmarkSubgraph could not identify hand -# presence in the previous image). Otherwise, the latter is selected, which is -# never empty because HandLandmarkSubgraphs processes all images (that went -# through FlowLimiterCaculator). -node { - calculator: "MergeCalculator" - input_stream: "hand_rect_from_palm_detections" - input_stream: "prev_hand_rect_from_landmarks" - output_stream: "hand_rect" + output_stream: "PALM_DETECTIONS:palm_detections" + output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects_from_landmarks" + output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections" } # Subgraph that renders annotations and overlays them on top of the input -# images (see renderer_gpu.pbtxt). +# images (see hand_renderer_gpu.pbtxt). 
node { - calculator: "RendererSubgraph" + calculator: "HandRendererSubgraph" input_stream: "IMAGE:throttled_input_video" - input_stream: "LANDMARKS:hand_landmarks" - input_stream: "NORM_RECT:hand_rect" input_stream: "DETECTIONS:palm_detections" + input_stream: "LANDMARKS:hand_landmarks" input_stream: "HANDEDNESS:handedness" + input_stream: "NORM_RECTS:0:hand_rects_from_palm_detections" + input_stream: "NORM_RECTS:1:hand_rects_from_landmarks" output_stream: "IMAGE:output_video" } diff --git a/mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop.pbtxt b/mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop.pbtxt deleted file mode 100644 index aa4b4ae4d8..0000000000 --- a/mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop.pbtxt +++ /dev/null @@ -1,127 +0,0 @@ -# MediaPipe graph that performs multi-hand tracking on desktop with TensorFlow -# Lite on CPU. -# Used in the example in -# mediapipie/examples/desktop/hand_tracking:multi_hand_tracking_tflite. - -# max_queue_size limits the number of packets enqueued on any input stream -# by throttling inputs to the graph. This makes the graph only process one -# frame per time. -max_queue_size: 1 - -# Decodes an input video file into images and a video header. -node { - calculator: "OpenCvVideoDecoderCalculator" - input_side_packet: "INPUT_FILE_PATH:input_video_path" - output_stream: "VIDEO:input_video" - output_stream: "VIDEO_PRESTREAM:input_video_header" -} - -# Determines if an input vector of NormalizedRect has a size greater than or -# equal to the provided min_size. -node { - calculator: "NormalizedRectVectorHasMinSizeCalculator" - input_stream: "ITERABLE:prev_multi_hand_rects_from_landmarks" - output_stream: "prev_has_enough_hands" - node_options: { - [type.googleapis.com/mediapipe.CollectionHasMinSizeCalculatorOptions] { - # This value can be changed to support tracking arbitrary number of hands. - # Please also remember to modify max_vec_size in - # ClipVectorSizeCalculatorOptions in - # mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_cpu.pbtxt - min_size: 2 - } - } -} - -# Drops the incoming image if the previous frame had at least N hands. -# Otherwise, passes the incoming image through to trigger a new round of hand -# detection in MultiHandDetectionSubgraph. -node { - calculator: "GateCalculator" - input_stream: "input_video" - input_stream: "DISALLOW:prev_has_enough_hands" - output_stream: "multi_hand_detection_input_video" - node_options: { - [type.googleapis.com/mediapipe.GateCalculatorOptions] { - empty_packets_as_allow: true - } - } -} - -# Subgraph that detections hands (see multi_hand_detection_cpu.pbtxt). -node { - calculator: "MultiHandDetectionSubgraph" - input_stream: "multi_hand_detection_input_video" - output_stream: "DETECTIONS:multi_palm_detections" - output_stream: "NORM_RECTS:multi_palm_rects" -} - -# Subgraph that localizes hand landmarks for multiple hands (see -# multi_hand_landmark.pbtxt). -node { - calculator: "MultiHandLandmarkSubgraph" - input_stream: "IMAGE:input_video" - input_stream: "NORM_RECTS:multi_hand_rects" - output_stream: "LANDMARKS:multi_hand_landmarks" - output_stream: "NORM_RECTS:multi_hand_rects_from_landmarks" -} - -# Caches a hand rectangle fed back from MultiHandLandmarkSubgraph, and upon the -# arrival of the next input image sends out the cached rectangle with the -# timestamp replaced by that of the input image, essentially generating a packet -# that carries the previous hand rectangle. 
Note that upon the arrival of the -# very first input image, an empty packet is sent out to jump start the -# feedback loop. -node { - calculator: "PreviousLoopbackCalculator" - input_stream: "MAIN:input_video" - input_stream: "LOOP:multi_hand_rects_from_landmarks" - input_stream_info: { - tag_index: "LOOP" - back_edge: true - } - output_stream: "PREV_LOOP:prev_multi_hand_rects_from_landmarks" -} - -# Performs association between NormalizedRect vector elements from previous -# frame and those from the current frame if MultiHandDetectionSubgraph runs. -# This calculator ensures that the output multi_hand_rects vector doesn't -# contain overlapping regions based on the specified min_similarity_threshold. -node { - calculator: "AssociationNormRectCalculator" - input_stream: "prev_multi_hand_rects_from_landmarks" - input_stream: "multi_palm_rects" - output_stream: "multi_hand_rects" - node_options: { - [type.googleapis.com/mediapipe.AssociationCalculatorOptions] { - min_similarity_threshold: 0.5 - } - } -} - -# Subgraph that renders annotations and overlays them on top of the input -# images (see multi_hand_renderer_cpu.pbtxt). -node { - calculator: "MultiHandRendererSubgraph" - input_stream: "IMAGE:input_video" - input_stream: "DETECTIONS:multi_palm_detections" - input_stream: "LANDMARKS:multi_hand_landmarks" - input_stream: "NORM_RECTS:0:multi_palm_rects" - input_stream: "NORM_RECTS:1:multi_hand_rects" - output_stream: "IMAGE:output_video" -} - -# Encodes the annotated images into a video file, adopting properties specified -# in the input video header, e.g., video framerate. -node { - calculator: "OpenCvVideoEncoderCalculator" - input_stream: "VIDEO:output_video" - input_stream: "VIDEO_PRESTREAM:input_video_header" - input_side_packet: "OUTPUT_FILE_PATH:output_video_path" - node_options: { - [type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: { - codec: "avc1" - video_format: "mp4" - } - } -} diff --git a/mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop_live.pbtxt b/mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop_live.pbtxt deleted file mode 100644 index 4820f348ec..0000000000 --- a/mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop_live.pbtxt +++ /dev/null @@ -1,106 +0,0 @@ -# MediaPipe graph that performs multi-hand tracking on desktop with TensorFlow -# Lite on CPU. -# Used in the example in -# mediapipie/examples/desktop/hand_tracking:multi_hand_tracking_cpu. - -# Images coming into and out of the graph. -input_stream: "input_video" -output_stream: "output_video" -# Palm detections and hand landmarks info. -output_stream: "multi_palm_detections" -output_stream: "multi_hand_landmarks" - -# Determines if an input vector of NormalizedRect has a size greater than or -# equal to the provided min_size. -node { - calculator: "NormalizedRectVectorHasMinSizeCalculator" - input_stream: "ITERABLE:prev_multi_hand_rects_from_landmarks" - output_stream: "prev_has_enough_hands" - node_options: { - [type.googleapis.com/mediapipe.CollectionHasMinSizeCalculatorOptions] { - # This value can be changed to support tracking arbitrary number of hands. - # Please also remember to modify max_vec_size in - # ClipVectorSizeCalculatorOptions in - # mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_gpu.pbtxt - min_size: 2 - } - } -} - -# Drops the incoming image if the previous frame had at least N hands. -# Otherwise, passes the incoming image through to trigger a new round of hand -# detection in MultiHandDetectionSubgraph. 
-node { - calculator: "GateCalculator" - input_stream: "input_video" - input_stream: "DISALLOW:prev_has_enough_hands" - output_stream: "multi_hand_detection_input_video" - node_options: { - [type.googleapis.com/mediapipe.GateCalculatorOptions] { - empty_packets_as_allow: true - } - } -} - -# Subgraph that detections hands (see multi_hand_detection_cpu.pbtxt). -node { - calculator: "MultiHandDetectionSubgraph" - input_stream: "multi_hand_detection_input_video" - output_stream: "DETECTIONS:multi_palm_detections" - output_stream: "NORM_RECTS:multi_palm_rects" -} - -# Subgraph that localizes hand landmarks for multiple hands (see -# multi_hand_landmark.pbtxt). -node { - calculator: "MultiHandLandmarkSubgraph" - input_stream: "IMAGE:input_video" - input_stream: "NORM_RECTS:multi_hand_rects" - output_stream: "LANDMARKS:multi_hand_landmarks" - output_stream: "NORM_RECTS:multi_hand_rects_from_landmarks" -} - -# Caches a hand rectangle fed back from MultiHandLandmarkSubgraph, and upon the -# arrival of the next input image sends out the cached rectangle with the -# timestamp replaced by that of the input image, essentially generating a packet -# that carries the previous hand rectangle. Note that upon the arrival of the -# very first input image, an empty packet is sent out to jump start the -# feedback loop. -node { - calculator: "PreviousLoopbackCalculator" - input_stream: "MAIN:input_video" - input_stream: "LOOP:multi_hand_rects_from_landmarks" - input_stream_info: { - tag_index: "LOOP" - back_edge: true - } - output_stream: "PREV_LOOP:prev_multi_hand_rects_from_landmarks" -} - -# Performs association between NormalizedRect vector elements from previous -# frame and those from the current frame if MultiHandDetectionSubgraph runs. -# This calculator ensures that the output multi_hand_rects vector doesn't -# contain overlapping regions based on the specified min_similarity_threshold. -node { - calculator: "AssociationNormRectCalculator" - input_stream: "prev_multi_hand_rects_from_landmarks" - input_stream: "multi_palm_rects" - output_stream: "multi_hand_rects" - node_options: { - [type.googleapis.com/mediapipe.AssociationCalculatorOptions] { - min_similarity_threshold: 0.5 - } - } -} - -# Subgraph that renders annotations and overlays them on top of the input -# images (see multi_hand_renderer_cpu.pbtxt). -node { - calculator: "MultiHandRendererSubgraph" - input_stream: "IMAGE:input_video" - input_stream: "DETECTIONS:multi_palm_detections" - input_stream: "LANDMARKS:multi_hand_landmarks" - input_stream: "NORM_RECTS:0:multi_palm_rects" - input_stream: "NORM_RECTS:1:multi_hand_rects" - output_stream: "IMAGE:output_video" -} diff --git a/mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt b/mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt deleted file mode 100644 index 87f6511778..0000000000 --- a/mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt +++ /dev/null @@ -1,123 +0,0 @@ -# MediaPipe graph that performs multi-hand tracking with TensorFlow Lite on GPU. -# Used in the examples in -# mediapipe/examples/android/src/java/com/mediapipe/apps/multihandtrackinggpu. - -# Images coming into and out of the graph. -input_stream: "input_video" -output_stream: "output_video" - -# Throttles the images flowing downstream for flow control. It passes through -# the very first incoming image unaltered, and waits for downstream nodes -# (calculators and subgraphs) in the graph to finish their tasks before it -# passes through another image. 
All images that come in while waiting are -# dropped, limiting the number of in-flight images in most part of the graph to -# 1. This prevents the downstream nodes from queuing up incoming images and data -# excessively, which leads to increased latency and memory usage, unwanted in -# real-time mobile applications. It also eliminates unnecessarily computation, -# e.g., the output produced by a node may get dropped downstream if the -# subsequent nodes are still busy processing previous inputs. -node { - calculator: "FlowLimiterCalculator" - input_stream: "input_video" - input_stream: "FINISHED:multi_hand_rects" - input_stream_info: { - tag_index: "FINISHED" - back_edge: true - } - output_stream: "throttled_input_video" -} - -# Determines if an input vector of NormalizedRect has a size greater than or -# equal to the provided min_size. -node { - calculator: "NormalizedRectVectorHasMinSizeCalculator" - input_stream: "ITERABLE:prev_multi_hand_rects_from_landmarks" - output_stream: "prev_has_enough_hands" - node_options: { - [type.googleapis.com/mediapipe.CollectionHasMinSizeCalculatorOptions] { - # This value can be changed to support tracking arbitrary number of hands. - # Please also remember to modify max_vec_size in - # ClipVectorSizeCalculatorOptions in - # mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_gpu.pbtxt - min_size: 2 - } - } -} - -# Drops the incoming image if the previous frame had at least N hands. -# Otherwise, passes the incoming image through to trigger a new round of hand -# detection in MultiHandDetectionSubgraph. -node { - calculator: "GateCalculator" - input_stream: "throttled_input_video" - input_stream: "DISALLOW:prev_has_enough_hands" - output_stream: "multi_hand_detection_input_video" - node_options: { - [type.googleapis.com/mediapipe.GateCalculatorOptions] { - empty_packets_as_allow: true - } - } -} - -# Subgraph that detections hands (see multi_hand_detection_gpu.pbtxt). -node { - calculator: "MultiHandDetectionSubgraph" - input_stream: "multi_hand_detection_input_video" - output_stream: "DETECTIONS:multi_palm_detections" - output_stream: "NORM_RECTS:multi_palm_rects" -} - -# Subgraph that localizes hand landmarks for multiple hands (see -# multi_hand_landmark.pbtxt). -node { - calculator: "MultiHandLandmarkSubgraph" - input_stream: "IMAGE:throttled_input_video" - input_stream: "NORM_RECTS:multi_hand_rects" - output_stream: "LANDMARKS:multi_hand_landmarks" - output_stream: "NORM_RECTS:multi_hand_rects_from_landmarks" -} - -# Caches a hand rectangle fed back from MultiHandLandmarkSubgraph, and upon the -# arrival of the next input image sends out the cached rectangle with the -# timestamp replaced by that of the input image, essentially generating a packet -# that carries the previous hand rectangle. Note that upon the arrival of the -# very first input image, an empty packet is sent out to jump start the -# feedback loop. -node { - calculator: "PreviousLoopbackCalculator" - input_stream: "MAIN:throttled_input_video" - input_stream: "LOOP:multi_hand_rects_from_landmarks" - input_stream_info: { - tag_index: "LOOP" - back_edge: true - } - output_stream: "PREV_LOOP:prev_multi_hand_rects_from_landmarks" -} - -# Performs association between NormalizedRect vector elements from previous -# frame and those from the current frame if MultiHandDetectionSubgraph runs. -# This calculator ensures that the output multi_hand_rects vector doesn't -# contain overlapping regions based on the specified min_similarity_threshold. 
-node { - calculator: "AssociationNormRectCalculator" - input_stream: "prev_multi_hand_rects_from_landmarks" - input_stream: "multi_palm_rects" - output_stream: "multi_hand_rects" - node_options: { - [type.googleapis.com/mediapipe.AssociationCalculatorOptions] { - min_similarity_threshold: 0.5 - } - } -} - -# Subgraph that renders annotations and overlays them on top of the input -# images (see multi_hand_renderer_gpu.pbtxt). -node { - calculator: "MultiHandRendererSubgraph" - input_stream: "IMAGE:throttled_input_video" - input_stream: "DETECTIONS:multi_palm_detections" - input_stream: "LANDMARKS:multi_hand_landmarks" - input_stream: "NORM_RECTS:0:multi_palm_rects" - input_stream: "NORM_RECTS:1:multi_hand_rects" - output_stream: "IMAGE:output_video" -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/BUILD b/mediapipe/graphs/hand_tracking/subgraphs/BUILD index da57d14475..f16a6db3ad 100644 --- a/mediapipe/graphs/hand_tracking/subgraphs/BUILD +++ b/mediapipe/graphs/hand_tracking/subgraphs/BUILD @@ -22,94 +22,16 @@ licenses(["notice"]) package(default_visibility = ["//visibility:public"]) mediapipe_simple_subgraph( - name = "hand_detection_cpu", - graph = "hand_detection_cpu.pbtxt", - register_as = "HandDetectionSubgraph", - deps = [ - "//mediapipe/calculators/image:image_properties_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", - "//mediapipe/calculators/util:detection_label_id_to_text_calculator", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", - "//mediapipe/calculators/util:detections_to_rects_calculator", - "//mediapipe/calculators/util:detections_to_render_data_calculator", - "//mediapipe/calculators/util:non_max_suppression_calculator", - "//mediapipe/calculators/util:rect_transformation_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "multi_hand_detection_cpu", - graph = "multi_hand_detection_cpu.pbtxt", - register_as = "MultiHandDetectionSubgraph", + name = "hand_renderer_cpu", + graph = "hand_renderer_cpu.pbtxt", + register_as = "HandRendererSubgraph", deps = [ "//mediapipe/calculators/core:begin_loop_calculator", - "//mediapipe/calculators/core:clip_vector_size_calculator", "//mediapipe/calculators/core:end_loop_calculator", - "//mediapipe/calculators/image:image_properties_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", - "//mediapipe/calculators/util:detection_label_id_to_text_calculator", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", - "//mediapipe/calculators/util:detections_to_rects_calculator", - "//mediapipe/calculators/util:non_max_suppression_calculator", - "//mediapipe/calculators/util:rect_transformation_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "hand_landmark_cpu", - graph = "hand_landmark_cpu.pbtxt", - register_as = "HandLandmarkSubgraph", - deps = [ - 
"//mediapipe/calculators/core:split_normalized_landmark_list_calculator", + "//mediapipe/calculators/core:gate_calculator", "//mediapipe/calculators/core:split_vector_calculator", - "//mediapipe/calculators/image:image_cropping_calculator", - "//mediapipe/calculators/image:image_properties_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_classification_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_landmarks_calculator", - "//mediapipe/calculators/util:detections_to_rects_calculator", - "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", - "//mediapipe/calculators/util:landmark_projection_calculator", - "//mediapipe/calculators/util:landmarks_to_detection_calculator", - "//mediapipe/calculators/util:landmarks_to_render_data_calculator", - "//mediapipe/calculators/util:rect_transformation_calculator", - "//mediapipe/calculators/util:thresholding_calculator", - "//mediapipe/graphs/hand_tracking/calculators:hand_landmarks_to_rect_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "multi_hand_landmark_cpu", - graph = "multi_hand_landmark.pbtxt", - register_as = "MultiHandLandmarkSubgraph", - deps = [ - ":hand_landmark_cpu", - "//mediapipe/calculators/core:begin_loop_calculator", - "//mediapipe/calculators/core:end_loop_calculator", - "//mediapipe/calculators/util:filter_collection_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "renderer_cpu", - graph = "renderer_cpu.pbtxt", - register_as = "RendererSubgraph", - deps = [ "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:collection_has_min_size_calculator", "//mediapipe/calculators/util:detections_to_render_data_calculator", "//mediapipe/calculators/util:labels_to_render_data_calculator", "//mediapipe/calculators/util:landmarks_to_render_data_calculator", @@ -118,123 +40,19 @@ mediapipe_simple_subgraph( ) mediapipe_simple_subgraph( - name = "multi_hand_renderer_cpu", - graph = "multi_hand_renderer_cpu.pbtxt", - register_as = "MultiHandRendererSubgraph", - deps = [ - "//mediapipe/calculators/core:begin_loop_calculator", - "//mediapipe/calculators/core:end_loop_calculator", - "//mediapipe/calculators/util:annotation_overlay_calculator", - "//mediapipe/calculators/util:detections_to_render_data_calculator", - "//mediapipe/calculators/util:landmarks_to_render_data_calculator", - "//mediapipe/calculators/util:rect_to_render_data_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "hand_detection_gpu", - graph = "hand_detection_gpu.pbtxt", - register_as = "HandDetectionSubgraph", - deps = [ - "//mediapipe/calculators/image:image_properties_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", - "//mediapipe/calculators/util:detection_label_id_to_text_calculator", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", - 
"//mediapipe/calculators/util:detections_to_rects_calculator", - "//mediapipe/calculators/util:non_max_suppression_calculator", - "//mediapipe/calculators/util:rect_transformation_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "multi_hand_detection_gpu", - graph = "multi_hand_detection_gpu.pbtxt", - register_as = "MultiHandDetectionSubgraph", + name = "hand_renderer_gpu", + graph = "hand_renderer_gpu.pbtxt", + register_as = "HandRendererSubgraph", deps = [ "//mediapipe/calculators/core:begin_loop_calculator", - "//mediapipe/calculators/core:clip_vector_size_calculator", "//mediapipe/calculators/core:end_loop_calculator", - "//mediapipe/calculators/image:image_properties_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", - "//mediapipe/calculators/util:detection_label_id_to_text_calculator", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", - "//mediapipe/calculators/util:detections_to_rects_calculator", - "//mediapipe/calculators/util:non_max_suppression_calculator", - "//mediapipe/calculators/util:rect_transformation_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "hand_landmark_gpu", - graph = "hand_landmark_gpu.pbtxt", - register_as = "HandLandmarkSubgraph", - deps = [ - "//mediapipe/calculators/core:split_normalized_landmark_list_calculator", + "//mediapipe/calculators/core:gate_calculator", "//mediapipe/calculators/core:split_vector_calculator", - "//mediapipe/calculators/image:image_cropping_calculator", - "//mediapipe/calculators/image:image_properties_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_classification_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_landmarks_calculator", - "//mediapipe/calculators/util:detections_to_rects_calculator", - "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", - "//mediapipe/calculators/util:landmark_projection_calculator", - "//mediapipe/calculators/util:landmarks_to_detection_calculator", - "//mediapipe/calculators/util:rect_transformation_calculator", - "//mediapipe/calculators/util:thresholding_calculator", - "//mediapipe/graphs/hand_tracking/calculators:hand_landmarks_to_rect_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "multi_hand_landmark_gpu", - graph = "multi_hand_landmark.pbtxt", - register_as = "MultiHandLandmarkSubgraph", - deps = [ - ":hand_landmark_gpu", - "//mediapipe/calculators/core:begin_loop_calculator", - "//mediapipe/calculators/core:end_loop_calculator", - "//mediapipe/calculators/util:filter_collection_calculator", - ], -) - -mediapipe_simple_subgraph( - name = "renderer_gpu", - graph = "renderer_gpu.pbtxt", - register_as = "RendererSubgraph", - deps = [ "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:collection_has_min_size_calculator", 
"//mediapipe/calculators/util:detections_to_render_data_calculator", "//mediapipe/calculators/util:labels_to_render_data_calculator", "//mediapipe/calculators/util:landmarks_to_render_data_calculator", "//mediapipe/calculators/util:rect_to_render_data_calculator", ], ) - -mediapipe_simple_subgraph( - name = "multi_hand_renderer_gpu", - graph = "multi_hand_renderer_gpu.pbtxt", - register_as = "MultiHandRendererSubgraph", - deps = [ - "//mediapipe/calculators/core:begin_loop_calculator", - "//mediapipe/calculators/core:end_loop_calculator", - "//mediapipe/calculators/util:annotation_overlay_calculator", - "//mediapipe/calculators/util:detections_to_render_data_calculator", - "//mediapipe/calculators/util:landmarks_to_render_data_calculator", - "//mediapipe/calculators/util:rect_to_render_data_calculator", - ], -) diff --git a/mediapipe/graphs/hand_tracking/subgraphs/hand_detection_cpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/hand_detection_cpu.pbtxt deleted file mode 100644 index 65c7d162fc..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/hand_detection_cpu.pbtxt +++ /dev/null @@ -1,193 +0,0 @@ -# MediaPipe hand detection subgraph. - -type: "HandDetectionSubgraph" - -input_stream: "input_video" -output_stream: "DETECTIONS:palm_detections" -output_stream: "NORM_RECT:hand_rect_from_palm_detections" - -# Transforms the input image on CPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE:input_video" - output_stream: "IMAGE:transformed_input_video" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Generates a single side packet containing a TensorFlow Lite op resolver that -# supports custom ops needed by the model used in this graph. -node { - calculator: "TfLiteCustomOpResolverCalculator" - output_side_packet: "op_resolver" -} - -# Converts the transformed input image on CPU into an image tensor as a -# TfLiteTensor. The zero_center option is set to true to normalize the -# pixel values to [-1.f, 1.f] as opposed to [0.f, 1.f]. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_input_video" - output_stream: "TENSORS:image_tensor" -} - -# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS:image_tensor" - output_stream: "TENSORS:detection_tensors" - input_side_packet: "CUSTOM_OP_RESOLVER:op_resolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/palm_detection.tflite" - } - } -} - -# Generates a single side packet containing a vector of SSD anchors based on -# the specification in the options. 
-node { - calculator: "SsdAnchorsCalculator" - output_side_packet: "anchors" - node_options: { - [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { - num_layers: 5 - min_scale: 0.1171875 - max_scale: 0.75 - input_size_height: 256 - input_size_width: 256 - anchor_offset_x: 0.5 - anchor_offset_y: 0.5 - strides: 8 - strides: 16 - strides: 32 - strides: 32 - strides: 32 - aspect_ratios: 1.0 - fixed_anchor_size: true - } - } -} - -# Decodes the detection tensors generated by the TensorFlow Lite model, based on -# the SSD anchors and the specification in the options, into a vector of -# detections. Each detection describes a detected object. -node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS:detection_tensors" - input_side_packet: "ANCHORS:anchors" - output_stream: "DETECTIONS:detections" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { - num_classes: 1 - num_boxes: 2944 - num_coords: 18 - box_coord_offset: 0 - keypoint_coord_offset: 4 - num_keypoints: 7 - num_values_per_keypoint: 2 - sigmoid_score: true - score_clipping_thresh: 100.0 - reverse_output_order: true - - x_scale: 256.0 - y_scale: 256.0 - h_scale: 256.0 - w_scale: 256.0 - min_score_thresh: 0.5 - } - } -} - -# Performs non-max suppression to remove excessive detections. -node { - calculator: "NonMaxSuppressionCalculator" - input_stream: "detections" - output_stream: "filtered_detections" - node_options: { - [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { - min_suppression_threshold: 0.3 - min_score_threshold: 0.5 - overlap_type: INTERSECTION_OVER_UNION - algorithm: WEIGHTED - return_empty_detections: true - } - } -} - -# Maps detection label IDs to the corresponding label text. The label map is -# provided in the label_map_path option. -node { - calculator: "DetectionLabelIdToTextCalculator" - input_stream: "filtered_detections" - output_stream: "labeled_detections" - node_options: { - [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { - label_map_path: "mediapipe/models/palm_detection_labelmap.txt" - } - } -} - -# Adjusts detection locations (already normalized to [0.f, 1.f]) on the -# letterboxed image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (the -# input image to the graph before image transformation). -node { - calculator: "DetectionLetterboxRemovalCalculator" - input_stream: "DETECTIONS:labeled_detections" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "DETECTIONS:palm_detections" -} - -# Extracts image size from the input images. -node { - calculator: "ImagePropertiesCalculator" - input_stream: "IMAGE:input_video" - output_stream: "SIZE:image_size" -} - -# Converts results of palm detection into a rectangle (normalized by image size) -# that encloses the palm and is rotated such that the line connecting center of -# the wrist and MCP of the middle finger is aligned with the Y-axis of the -# rectangle. -node { - calculator: "DetectionsToRectsCalculator" - input_stream: "DETECTIONS:palm_detections" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "NORM_RECT:palm_rect" - node_options: { - [type.googleapis.com/mediapipe.DetectionsToRectsCalculatorOptions] { - rotation_vector_start_keypoint_index: 0 # Center of wrist. - rotation_vector_end_keypoint_index: 2 # MCP of middle finger. 
- rotation_vector_target_angle_degrees: 90 - output_zero_rect_for_empty_detections: true - } - } -} - -# Expands and shifts the rectangle that contains the palm so that it's likely -# to cover the entire hand. -node { - calculator: "RectTransformationCalculator" - input_stream: "NORM_RECT:palm_rect" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "hand_rect_from_palm_detections" - node_options: { - [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { - scale_x: 2.6 - scale_y: 2.6 - shift_y: -0.5 - square_long: true - } - } -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt deleted file mode 100644 index 8332860667..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt +++ /dev/null @@ -1,197 +0,0 @@ -# MediaPipe hand detection subgraph. - -type: "HandDetectionSubgraph" - -input_stream: "input_video" -output_stream: "DETECTIONS:palm_detections" -output_stream: "NORM_RECT:hand_rect_from_palm_detections" - -# Transforms the input image on GPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE_GPU:input_video" - output_stream: "IMAGE_GPU:transformed_input_video" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Generates a single side packet containing a TensorFlow Lite op resolver that -# supports custom ops needed by the model used in this graph. -node { - calculator: "TfLiteCustomOpResolverCalculator" - output_side_packet: "opresolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] { - use_gpu: true - } - } -} - -# Converts the transformed input image on GPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_input_video" - output_stream: "TENSORS_GPU:image_tensor" -} - -# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:image_tensor" - output_stream: "TENSORS_GPU:detection_tensors" - input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/palm_detection.tflite" - use_gpu: true - } - } -} - -# Generates a single side packet containing a vector of SSD anchors based on -# the specification in the options. -node { - calculator: "SsdAnchorsCalculator" - output_side_packet: "anchors" - node_options: { - [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { - num_layers: 5 - min_scale: 0.1171875 - max_scale: 0.75 - input_size_height: 256 - input_size_width: 256 - anchor_offset_x: 0.5 - anchor_offset_y: 0.5 - strides: 8 - strides: 16 - strides: 32 - strides: 32 - strides: 32 - aspect_ratios: 1.0 - fixed_anchor_size: true - } - } -} - -# Decodes the detection tensors generated by the TensorFlow Lite model, based on -# the SSD anchors and the specification in the options, into a vector of -# detections. 
Each detection describes a detected object. -node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS_GPU:detection_tensors" - input_side_packet: "ANCHORS:anchors" - output_stream: "DETECTIONS:detections" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { - num_classes: 1 - num_boxes: 2944 - num_coords: 18 - box_coord_offset: 0 - keypoint_coord_offset: 4 - num_keypoints: 7 - num_values_per_keypoint: 2 - sigmoid_score: true - score_clipping_thresh: 100.0 - reverse_output_order: true - - x_scale: 256.0 - y_scale: 256.0 - h_scale: 256.0 - w_scale: 256.0 - min_score_thresh: 0.7 - } - } -} - -# Performs non-max suppression to remove excessive detections. -node { - calculator: "NonMaxSuppressionCalculator" - input_stream: "detections" - output_stream: "filtered_detections" - node_options: { - [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { - min_suppression_threshold: 0.3 - overlap_type: INTERSECTION_OVER_UNION - algorithm: WEIGHTED - return_empty_detections: true - } - } -} - -# Maps detection label IDs to the corresponding label text ("Palm"). The label -# map is provided in the label_map_path option. -node { - calculator: "DetectionLabelIdToTextCalculator" - input_stream: "filtered_detections" - output_stream: "labeled_detections" - node_options: { - [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { - label_map_path: "mediapipe/models/palm_detection_labelmap.txt" - } - } -} - -# Adjusts detection locations (already normalized to [0.f, 1.f]) on the -# letterboxed image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (the -# input image to the graph before image transformation). -node { - calculator: "DetectionLetterboxRemovalCalculator" - input_stream: "DETECTIONS:labeled_detections" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "DETECTIONS:palm_detections" -} - -# Extracts image size from the input images. -node { - calculator: "ImagePropertiesCalculator" - input_stream: "IMAGE_GPU:input_video" - output_stream: "SIZE:image_size" -} - -# Converts results of palm detection into a rectangle (normalized by image size) -# that encloses the palm and is rotated such that the line connecting center of -# the wrist and MCP of the middle finger is aligned with the Y-axis of the -# rectangle. -node { - calculator: "DetectionsToRectsCalculator" - input_stream: "DETECTIONS:palm_detections" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "NORM_RECT:palm_rect" - node_options: { - [type.googleapis.com/mediapipe.DetectionsToRectsCalculatorOptions] { - rotation_vector_start_keypoint_index: 0 # Center of wrist. - rotation_vector_end_keypoint_index: 2 # MCP of middle finger. - rotation_vector_target_angle_degrees: 90 - output_zero_rect_for_empty_detections: true - } - } -} - -# Expands and shifts the rectangle that contains the palm so that it's likely -# to cover the entire hand. 
-node { - calculator: "RectTransformationCalculator" - input_stream: "NORM_RECT:palm_rect" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "hand_rect_from_palm_detections" - node_options: { - [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { - scale_x: 2.6 - scale_y: 2.6 - shift_y: -0.5 - square_long: true - } - } -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_cpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_cpu.pbtxt deleted file mode 100644 index 9d42ddfbf5..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_cpu.pbtxt +++ /dev/null @@ -1,226 +0,0 @@ -# MediaPipe hand landmark localization subgraph. - -type: "HandLandmarkSubgraph" - -input_stream: "IMAGE:input_video" -input_stream: "NORM_RECT:hand_rect" -output_stream: "LANDMARKS:hand_landmarks" -output_stream: "NORM_RECT:hand_rect_for_next_frame" -output_stream: "PRESENCE:hand_presence" -output_stream: "PRESENCE_SCORE:hand_presence_score" -output_stream: "HANDEDNESS:handedness" - -# Crops the rectangle that contains a hand from the input image. -node { - calculator: "ImageCroppingCalculator" - input_stream: "IMAGE:input_video" - input_stream: "NORM_RECT:hand_rect" - output_stream: "IMAGE:hand_image" - node_options: { - [type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] { - border_mode: BORDER_REPLICATE - } - } -} - -# Transforms the input image on CPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE:hand_image" - output_stream: "IMAGE:transformed_input_video" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Generates a single side packet containing a TensorFlow Lite op resolver that -# supports custom ops needed by the model used in this graph. -node { - calculator: "TfLiteCustomOpResolverCalculator" - output_side_packet: "op_resolver" -} - -# Converts the transformed input image on CPU into an image tensor stored in -# TfliteTensor. The zero_center option is set to false to normalize the -# pixel values to [0.f, 1.f]. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_input_video" - output_stream: "TENSORS:image_tensor" - node_options: { - [type.googleapis.com/mediapipe.TfLiteConverterCalculatorOptions] { - zero_center: false - } - } -} - -# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS:image_tensor" - output_stream: "TENSORS:output_tensors" - input_side_packet: "CUSTOM_OP_RESOLVER:op_resolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/hand_landmark.tflite" - } - } -} - -# Splits a vector of TFLite tensors to multiple vectors according to the ranges -# specified in option. 
-node { - calculator: "SplitTfLiteTensorVectorCalculator" - input_stream: "output_tensors" - output_stream: "landmark_tensors" - output_stream: "hand_flag_tensor" - output_stream: "handedness_tensor" - node_options: { - [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { - ranges: { begin: 0 end: 1 } - ranges: { begin: 1 end: 2 } - ranges: { begin: 2 end: 3 } - } - } -} - -# Converts the hand-flag tensor into a float that represents the confidence -# score of hand presence. -node { - calculator: "TfLiteTensorsToFloatsCalculator" - input_stream: "TENSORS:hand_flag_tensor" - output_stream: "FLOAT:hand_presence_score" -} - -# Converts the handedness tensor into a float as the score of the handedness -# binary classifciation. -node { - calculator: "TfLiteTensorsToClassificationCalculator" - input_stream: "TENSORS:handedness_tensor" - output_stream: "CLASSIFICATIONS:handedness" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToClassificationCalculatorOptions] { - top_k: 1 - label_map_path: "mediapipe/models/handedness.txt" - binary_classification: true - } - } -} - -# Applies a threshold to the confidence score to determine whether a hand is -# present. -node { - calculator: "ThresholdingCalculator" - input_stream: "FLOAT:hand_presence_score" - output_stream: "FLAG:hand_presence" - node_options: { - [type.googleapis.com/mediapipe.ThresholdingCalculatorOptions] { - threshold: 0.5 - } - } -} - -# Decodes the landmark tensors into a list of landmarks, where the landmark -# coordinates are normalized by the size of the input image to the model. -node { - calculator: "TfLiteTensorsToLandmarksCalculator" - input_stream: "TENSORS:landmark_tensors" - output_stream: "NORM_LANDMARKS:landmarks" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToLandmarksCalculatorOptions] { - num_landmarks: 21 - input_image_width: 256 - input_image_height: 256 - # The additional scaling factor is used to account for the Z coordinate - # distribution in the training data. - normalize_z: 0.4 - } - } -} - -# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand -# image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (hand -# image before image transformation). -node { - calculator: "LandmarkLetterboxRemovalCalculator" - input_stream: "LANDMARKS:landmarks" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "LANDMARKS:scaled_landmarks" -} - -# Projects the landmarks from the cropped hand image to the corresponding -# locations on the full image before cropping (input to the graph). -node { - calculator: "LandmarkProjectionCalculator" - input_stream: "NORM_LANDMARKS:scaled_landmarks" - input_stream: "NORM_RECT:hand_rect" - output_stream: "NORM_LANDMARKS:hand_landmarks" -} - -# Extracts image size from the input images. -node { - calculator: "ImagePropertiesCalculator" - input_stream: "IMAGE:input_video" - output_stream: "SIZE:image_size" -} - -# Extracts a subset of the hand landmarks that are relatively more stable across -# frames (e.g. comparing to finger tips) for computing the bounding box. The box -# will later be expanded to contain the entire hand. In this approach, it is -# more robust to drastically changing hand size. -# The landmarks extracted are: wrist, MCP/PIP of five fingers. 
-node { - calculator: "SplitNormalizedLandmarkListCalculator" - input_stream: "hand_landmarks" - output_stream: "partial_landmarks" - node_options: { - [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { - ranges: { begin: 0 end: 4 } - ranges: { begin: 5 end: 7 } - ranges: { begin: 9 end: 11 } - ranges: { begin: 13 end: 15 } - ranges: { begin: 17 end: 19 } - combine_outputs: true - } - } -} - -# Converts the hand landmarks into a rectangle (normalized by image size) -# that encloses the hand. The calculator uses a subset of all hand landmarks -# extracted from SplitNormalizedLandmarkListCalculator above to -# calculate the bounding box and the rotation of the output rectangle. Please -# see the comments in the calculator for more detail. -node { - calculator: "HandLandmarksToRectCalculator" - input_stream: "NORM_LANDMARKS:partial_landmarks" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "NORM_RECT:hand_rect_from_landmarks" -} - -# Expands the hand rectangle so that the box contains the entire hand and it's -# big enough so that it's likely to still contain the hand even with some motion -# in the next video frame . -node { - calculator: "RectTransformationCalculator" - input_stream: "NORM_RECT:hand_rect_from_landmarks" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "hand_rect_for_next_frame" - node_options: { - [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { - scale_x: 2.0 - scale_y: 2.0 - shift_y: -0.1 - square_long: true - } - } -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt deleted file mode 100644 index b3f316a413..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt +++ /dev/null @@ -1,230 +0,0 @@ -# MediaPipe hand landmark localization subgraph. - -type: "HandLandmarkSubgraph" - -input_stream: "IMAGE:input_video" -input_stream: "NORM_RECT:hand_rect" -output_stream: "LANDMARKS:hand_landmarks" -output_stream: "NORM_RECT:hand_rect_for_next_frame" -output_stream: "PRESENCE:hand_presence" -output_stream: "PRESENCE_SCORE:hand_presence_score" -output_stream: "HANDEDNESS:handedness" - -# Crops the rectangle that contains a hand from the input image. -node { - calculator: "ImageCroppingCalculator" - input_stream: "IMAGE_GPU:input_video" - input_stream: "NORM_RECT:hand_rect" - output_stream: "IMAGE_GPU:hand_image" - node_options: { - [type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] { - border_mode: BORDER_REPLICATE - } - } -} - -# Transforms the input image on GPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE_GPU:hand_image" - output_stream: "IMAGE_GPU:transformed_hand_image" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Generates a single side packet containing a TensorFlow Lite op resolver that -# supports custom ops needed by the model used in this graph. 
-node { - calculator: "TfLiteCustomOpResolverCalculator" - output_side_packet: "op_resolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] { - use_gpu: true - } - } -} - -# Converts the transformed input image on GPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_hand_image" - output_stream: "TENSORS_GPU:image_tensor" - node_options: { - [type.googleapis.com/mediapipe.TfLiteConverterCalculatorOptions] { - zero_center: false - } - } -} - -# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:image_tensor" - output_stream: "TENSORS:output_tensors" - input_side_packet: "CUSTOM_OP_RESOLVER:op_resolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/hand_landmark.tflite" - use_gpu: true - } - } -} - -# Splits a vector of tensors into multiple vectors. -node { - calculator: "SplitTfLiteTensorVectorCalculator" - input_stream: "output_tensors" - output_stream: "landmark_tensors" - output_stream: "hand_flag_tensor" - output_stream: "handedness_tensor" - node_options: { - [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { - ranges: { begin: 0 end: 1 } - ranges: { begin: 1 end: 2 } - ranges: { begin: 2 end: 3 } - } - } -} - -# Converts the hand-flag tensor into a float that represents the confidence -# score of hand presence. -node { - calculator: "TfLiteTensorsToFloatsCalculator" - input_stream: "TENSORS:hand_flag_tensor" - output_stream: "FLOAT:hand_presence_score" -} - -# Converts the handedness tensor into a float as the score of the handedness -# binary classifciation. -node { - calculator: "TfLiteTensorsToClassificationCalculator" - input_stream: "TENSORS:handedness_tensor" - output_stream: "CLASSIFICATIONS:handedness" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToClassificationCalculatorOptions] { - top_k: 1 - label_map_path: "mediapipe/models/handedness.txt" - binary_classification: true - } - } -} - -# Applies a threshold to the confidence score to determine whether a hand is -# present. -node { - calculator: "ThresholdingCalculator" - input_stream: "FLOAT:hand_presence_score" - output_stream: "FLAG:hand_presence" - node_options: { - [type.googleapis.com/mediapipe.ThresholdingCalculatorOptions] { - threshold: 0.5 - } - } -} - -# Decodes the landmark tensors into a list of landmarks, where the landmark -# coordinates are normalized by the size of the input image to the model. -node { - calculator: "TfLiteTensorsToLandmarksCalculator" - input_stream: "TENSORS:landmark_tensors" - output_stream: "NORM_LANDMARKS:landmarks" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToLandmarksCalculatorOptions] { - num_landmarks: 21 - input_image_width: 256 - input_image_height: 256 - # The additional scaling factor is used to account for the Z coordinate - # distribution in the training data. - normalize_z: 0.4 - } - } -} - -# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand -# image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (hand -# image before image transformation). 
-node { - calculator: "LandmarkLetterboxRemovalCalculator" - input_stream: "LANDMARKS:landmarks" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "LANDMARKS:scaled_landmarks" -} - -# Projects the landmarks from the cropped hand image to the corresponding -# locations on the full image before cropping (input to the graph). -node { - calculator: "LandmarkProjectionCalculator" - input_stream: "NORM_LANDMARKS:scaled_landmarks" - input_stream: "NORM_RECT:hand_rect" - output_stream: "NORM_LANDMARKS:hand_landmarks" -} - -# Extracts image size from the input images. -node { - calculator: "ImagePropertiesCalculator" - input_stream: "IMAGE_GPU:input_video" - output_stream: "SIZE:image_size" -} - -# Extracts a subset of the hand landmarks that are relatively more stable across -# frames (e.g. comparing to finger tips) for computing the bounding box. The box -# will later be expanded to contain the entire hand. In this approach, it is -# more robust to drastically changing hand size. -# The landmarks extracted are: wrist, MCP/PIP of five fingers. -node { - calculator: "SplitNormalizedLandmarkListCalculator" - input_stream: "hand_landmarks" - output_stream: "partial_landmarks" - node_options: { - [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { - ranges: { begin: 0 end: 4 } - ranges: { begin: 5 end: 7 } - ranges: { begin: 9 end: 11 } - ranges: { begin: 13 end: 15 } - ranges: { begin: 17 end: 19 } - combine_outputs: true - } - } -} - -# Converts the hand landmarks into a rectangle (normalized by image size) -# that encloses the hand. The calculator uses a subset of all hand landmarks -# extracted from SplitNormalizedLandmarkListCalculator above to -# calculate the bounding box and the rotation of the output rectangle. Please -# see the comments in the calculator for more detail. -node { - calculator: "HandLandmarksToRectCalculator" - input_stream: "NORM_LANDMARKS:partial_landmarks" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "NORM_RECT:hand_rect_from_landmarks" -} - -# Expands the hand rectangle so that the box contains the entire hand and it's -# big enough so that it's likely to still contain the hand even with some motion -# in the next video frame . -node { - calculator: "RectTransformationCalculator" - input_stream: "NORM_RECT:hand_rect_from_landmarks" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "hand_rect_for_next_frame" - node_options: { - [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { - scale_x: 2.0 - scale_y: 2.0 - shift_y: -0.1 - square_long: true - } - } -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_renderer_cpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_cpu.pbtxt similarity index 67% rename from mediapipe/graphs/hand_tracking/subgraphs/multi_hand_renderer_cpu.pbtxt rename to mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_cpu.pbtxt index 5bc99c0339..eed1388741 100644 --- a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_renderer_cpu.pbtxt +++ b/mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_cpu.pbtxt @@ -1,16 +1,25 @@ -# MediaPipe multi-hand tracking rendering subgraph. +# MediaPipe graph to render hand landmarks and some related debug information. -type: "MultiHandRendererSubgraph" +type: "HandRendererSubgraph" +# CPU image. (ImageFrame) input_stream: "IMAGE:input_image" -# A vector of NormalizedLandmarks, one for each hand. +# Collection of detected/predicted hands, each represented as a list of +# landmarks. 
(std::vector&lt;NormalizedLandmarkList&gt;) input_stream: "LANDMARKS:multi_hand_landmarks" -# A vector of NormalizedRect, one for each hand. +# Handedness of the detected hand (i.e. is the hand left or right). +# (std::vector&lt;ClassificationList&gt;) +input_stream: "HANDEDNESS:multi_handedness" +# Regions of interest calculated based on palm detections. +# (std::vector&lt;NormalizedRect&gt;) input_stream: "NORM_RECTS:0:multi_palm_rects" -# A vector of NormalizedRect, one for each hand. +# Regions of interest calculated based on landmarks. +# (std::vector&lt;NormalizedRect&gt;) input_stream: "NORM_RECTS:1:multi_hand_rects" -# A vector of Detection, one for each hand. +# Detected palms. (std::vector&lt;Detection&gt;) input_stream: "DETECTIONS:palm_detections" + +# Updated CPU image. (ImageFrame) output_stream: "IMAGE:output_image" # Converts detections to drawing primitives for annotation overlay. @@ -131,6 +140,61 @@ node { output_stream: "ITERABLE:multi_hand_landmarks_render_data" } +# Don't render handedness if more than one handedness is reported. +node { + calculator: "ClassificationListVectorHasMinSizeCalculator" + input_stream: "ITERABLE:multi_handedness" + output_stream: "disallow_handedness_rendering" + node_options: { + [type.googleapis.com/mediapipe.CollectionHasMinSizeCalculatorOptions] { + min_size: 2 + } + } +} + +node { + calculator: "GateCalculator" + input_stream: "multi_handedness" + input_stream: "DISALLOW:disallow_handedness_rendering" + output_stream: "allowed_multi_handedness" + node_options: { + [type.googleapis.com/mediapipe.GateCalculatorOptions] { + empty_packets_as_allow: false + } + } +} + +node { + calculator: "SplitClassificationListVectorCalculator" + input_stream: "allowed_multi_handedness" + output_stream: "handedness" + node_options: { + [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { + ranges: { begin: 0 end: 1 } + element_only: true + } + } +} + +# Converts classification to drawing primitives for annotation overlay. +node { + calculator: "LabelsToRenderDataCalculator" + input_stream: "CLASSIFICATIONS:handedness" + output_stream: "RENDER_DATA:handedness_render_data" + node_options: { + [type.googleapis.com/mediapipe.LabelsToRenderDataCalculatorOptions]: { + color { r: 255 g: 0 b: 0 } + thickness: 10.0 + font_height_px: 50 + horizontal_offset_px: 30 + vertical_offset_px: 50 + + max_num_labels: 1 + location: TOP_LEFT + } + } +} + # Draws annotations and overlays them on top of the input images. Consumes # a vector of RenderData objects and draws each of them on the input frame. node { @@ -139,6 +203,7 @@ node { input_stream: "detection_render_data" input_stream: "multi_hand_rects_render_data" input_stream: "multi_palm_rects_render_data" + input_stream: "handedness_render_data" input_stream: "VECTOR:0:multi_hand_landmarks_render_data" output_stream: "IMAGE:output_image" } diff --git a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_renderer_gpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_gpu.pbtxt similarity index 67% rename from mediapipe/graphs/hand_tracking/subgraphs/multi_hand_renderer_gpu.pbtxt rename to mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_gpu.pbtxt index 82bd4ce900..9f0af85e26 100644 --- a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_renderer_gpu.pbtxt +++ b/mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_gpu.pbtxt @@ -1,16 +1,25 @@ -# MediaPipe multi-hand tracking rendering subgraph. +# MediaPipe graph to render hand landmarks and some related debug information. -type: "MultiHandRendererSubgraph" +type: "HandRendererSubgraph" +# GPU buffer. 
(GpuBuffer) input_stream: "IMAGE:input_image" -# A vector of NormalizedLandmarks, one for each hand. +# Collection of detected/predicted hands, each represented as a list of +# landmarks. (std::vector&lt;NormalizedLandmarkList&gt;) input_stream: "LANDMARKS:multi_hand_landmarks" -# A vector of NormalizedRect, one for each hand. +# Handedness of the detected hand (i.e. is the hand left or right). +# (std::vector&lt;ClassificationList&gt;) +input_stream: "HANDEDNESS:multi_handedness" +# Regions of interest calculated based on palm detections. +# (std::vector&lt;NormalizedRect&gt;) input_stream: "NORM_RECTS:0:multi_palm_rects" -# A vector of NormalizedRect, one for each hand. +# Regions of interest calculated based on landmarks. +# (std::vector&lt;NormalizedRect&gt;) input_stream: "NORM_RECTS:1:multi_hand_rects" -# A vector of Detection, one for each hand. +# Detected palms. (std::vector&lt;Detection&gt;) input_stream: "DETECTIONS:palm_detections" + +# Updated GPU buffer. (GpuBuffer) output_stream: "IMAGE:output_image" # Converts detections to drawing primitives for annotation overlay. @@ -131,6 +140,61 @@ node { output_stream: "ITERABLE:multi_hand_landmarks_render_data" } +# Don't render handedness if more than one handedness is reported. +node { + calculator: "ClassificationListVectorHasMinSizeCalculator" + input_stream: "ITERABLE:multi_handedness" + output_stream: "disallow_handedness_rendering" + node_options: { + [type.googleapis.com/mediapipe.CollectionHasMinSizeCalculatorOptions] { + min_size: 2 + } + } +} + +node { + calculator: "GateCalculator" + input_stream: "multi_handedness" + input_stream: "DISALLOW:disallow_handedness_rendering" + output_stream: "allowed_multi_handedness" + node_options: { + [type.googleapis.com/mediapipe.GateCalculatorOptions] { + empty_packets_as_allow: false + } + } +} + +node { + calculator: "SplitClassificationListVectorCalculator" + input_stream: "allowed_multi_handedness" + output_stream: "handedness" + node_options: { + [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { + ranges: { begin: 0 end: 1 } + element_only: true + } + } +} + +# Converts classification to drawing primitives for annotation overlay. +node { + calculator: "LabelsToRenderDataCalculator" + input_stream: "CLASSIFICATIONS:handedness" + output_stream: "RENDER_DATA:handedness_render_data" + node_options: { + [type.googleapis.com/mediapipe.LabelsToRenderDataCalculatorOptions]: { + color { r: 255 g: 0 b: 0 } + thickness: 10.0 + font_height_px: 50 + horizontal_offset_px: 30 + vertical_offset_px: 50 + + max_num_labels: 1 + location: TOP_LEFT + } + } +} + # Draws annotations and overlays them on top of the input images. Consumes # a vector of RenderData objects and draws each of them on the input frame. node { @@ -139,6 +203,7 @@ node { input_stream: "detection_render_data" input_stream: "multi_hand_rects_render_data" input_stream: "multi_palm_rects_render_data" + input_stream: "handedness_render_data" input_stream: "VECTOR:0:multi_hand_landmarks_render_data" output_stream: "IMAGE_GPU:output_image" } diff --git a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_cpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_cpu.pbtxt deleted file mode 100644 index 928e752138..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_cpu.pbtxt +++ /dev/null @@ -1,212 +0,0 @@ -# MediaPipe multi-hand detection subgraph. 
- -type: "MultiHandDetectionSubgraph" - -input_stream: "input_video" -output_stream: "DETECTIONS:palm_detections" -output_stream: "NORM_RECTS:clipped_hand_rects_from_palm_detections" - -# Transforms the input image on CPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE:input_video" - output_stream: "IMAGE:transformed_input_video" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Generates a single side packet containing a TensorFlow Lite op resolver that -# supports custom ops needed by the model used in this graph. -node { - calculator: "TfLiteCustomOpResolverCalculator" - output_side_packet: "opresolver" -} - -# Converts the transformed input image on CPU into an image tensor as a -# TfLiteTensor. The zero_center option is set to true to normalize the -# pixel values to [-1.f, 1.f] as opposed to [0.f, 1.f]. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_input_video" - output_stream: "TENSORS:image_tensor" -} - -# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS:image_tensor" - output_stream: "TENSORS:detection_tensors" - input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/palm_detection.tflite" - } - } -} - -# Generates a single side packet containing a vector of SSD anchors based on -# the specification in the options. -node { - calculator: "SsdAnchorsCalculator" - output_side_packet: "anchors" - node_options: { - [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { - num_layers: 5 - min_scale: 0.1171875 - max_scale: 0.75 - input_size_height: 256 - input_size_width: 256 - anchor_offset_x: 0.5 - anchor_offset_y: 0.5 - strides: 8 - strides: 16 - strides: 32 - strides: 32 - strides: 32 - aspect_ratios: 1.0 - fixed_anchor_size: true - } - } -} - -# Decodes the detection tensors generated by the TensorFlow Lite model, based on -# the SSD anchors and the specification in the options, into a vector of -# detections. Each detection describes a detected object. -node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS:detection_tensors" - input_side_packet: "ANCHORS:anchors" - output_stream: "DETECTIONS:detections" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { - num_classes: 1 - num_boxes: 2944 - num_coords: 18 - box_coord_offset: 0 - keypoint_coord_offset: 4 - num_keypoints: 7 - num_values_per_keypoint: 2 - sigmoid_score: true - score_clipping_thresh: 100.0 - reverse_output_order: true - - x_scale: 256.0 - y_scale: 256.0 - h_scale: 256.0 - w_scale: 256.0 - min_score_thresh: 0.7 - } - } -} - -# Performs non-max suppression to remove excessive detections. 
-node { - calculator: "NonMaxSuppressionCalculator" - input_stream: "detections" - output_stream: "filtered_detections" - node_options: { - [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { - min_suppression_threshold: 0.3 - overlap_type: INTERSECTION_OVER_UNION - algorithm: WEIGHTED - return_empty_detections: true - } - } -} - -# Maps detection label IDs to the corresponding label text ("Palm"). The label -# map is provided in the label_map_path option. -node { - calculator: "DetectionLabelIdToTextCalculator" - input_stream: "filtered_detections" - output_stream: "labeled_detections" - node_options: { - [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { - label_map_path: "mediapipe/models/palm_detection_labelmap.txt" - } - } -} - -# Adjusts detection locations (already normalized to [0.f, 1.f]) on the -# letterboxed image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (the -# input image to the graph before image transformation). -node { - calculator: "DetectionLetterboxRemovalCalculator" - input_stream: "DETECTIONS:labeled_detections" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "DETECTIONS:palm_detections" -} - -# Extracts image size from the input images. -node { - calculator: "ImagePropertiesCalculator" - input_stream: "IMAGE:input_video" - output_stream: "SIZE:image_size" -} - -# Converts each palm detection into a rectangle (normalized by image size) -# that encloses the palm and is rotated such that the line connecting center of -# the wrist and MCP of the middle finger is aligned with the Y-axis of the -# rectangle. -node { - calculator: "DetectionsToRectsCalculator" - input_stream: "DETECTIONS:palm_detections" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "NORM_RECTS:palm_rects" - node_options: { - [type.googleapis.com/mediapipe.DetectionsToRectsCalculatorOptions] { - rotation_vector_start_keypoint_index: 0 # Center of wrist. - rotation_vector_end_keypoint_index: 2 # MCP of middle finger. - rotation_vector_target_angle_degrees: 90 - output_zero_rect_for_empty_detections: true - } - } -} - -# Expands and shifts the rectangle that contains the palm so that it's likely -# to cover the entire hand. -node { - calculator: "RectTransformationCalculator" - input_stream: "NORM_RECTS:palm_rects" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "hand_rects_from_palm_detections" - node_options: { - [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { - scale_x: 2.6 - scale_y: 2.6 - shift_y: -0.5 - square_long: true - } - } -} - -# Clips the size of the input vector to the provided max_vec_size. This -# determines the maximum number of hand instances this graph outputs. -# Note that the performance gain of clipping detections earlier in this graph is -# minimal because NMS will minimize overlapping detections and the number of -# detections isn't expected to exceed 5-10. -node { - calculator: "ClipNormalizedRectVectorSizeCalculator" - input_stream: "hand_rects_from_palm_detections" - output_stream: "clipped_hand_rects_from_palm_detections" - node_options: { - [type.googleapis.com/mediapipe.ClipVectorSizeCalculatorOptions] { - # This value can be changed to support tracking arbitrary number of hands. - # Please also remember to modify min_size in - # CollectionHsMinSizeCalculatorOptions in - # mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop.pbtxt. 
- max_vec_size: 2 - } - } -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_gpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_gpu.pbtxt deleted file mode 100644 index afd1fd1524..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_detection_gpu.pbtxt +++ /dev/null @@ -1,218 +0,0 @@ -# MediaPipe multi-hand detection subgraph. - -type: "MultiHandDetectionSubgraph" - -input_stream: "input_video" -output_stream: "DETECTIONS:palm_detections" -output_stream: "NORM_RECTS:clipped_hand_rects_from_palm_detections" - -# Transforms the input image on GPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE_GPU:input_video" - output_stream: "IMAGE_GPU:transformed_input_video" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - node_options: { - [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Generates a single side packet containing a TensorFlow Lite op resolver that -# supports custom ops needed by the model used in this graph. -node { - calculator: "TfLiteCustomOpResolverCalculator" - output_side_packet: "opresolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] { - use_gpu: true - } - } -} - -# Converts the transformed input image on GPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_input_video" - output_stream: "TENSORS_GPU:image_tensor" -} - -# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a -# vector of tensors representing, for instance, detection boxes/keypoints and -# scores. -node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:image_tensor" - output_stream: "TENSORS_GPU:detection_tensors" - input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" - node_options: { - [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { - model_path: "mediapipe/models/palm_detection.tflite" - use_gpu: true - } - } -} - -# Generates a single side packet containing a vector of SSD anchors based on -# the specification in the options. -node { - calculator: "SsdAnchorsCalculator" - output_side_packet: "anchors" - node_options: { - [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { - num_layers: 5 - min_scale: 0.1171875 - max_scale: 0.75 - input_size_height: 256 - input_size_width: 256 - anchor_offset_x: 0.5 - anchor_offset_y: 0.5 - strides: 8 - strides: 16 - strides: 32 - strides: 32 - strides: 32 - aspect_ratios: 1.0 - fixed_anchor_size: true - } - } -} - -# Decodes the detection tensors generated by the TensorFlow Lite model, based on -# the SSD anchors and the specification in the options, into a vector of -# detections. Each detection describes a detected object. 
-node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS_GPU:detection_tensors" - input_side_packet: "ANCHORS:anchors" - output_stream: "DETECTIONS:detections" - node_options: { - [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { - num_classes: 1 - num_boxes: 2944 - num_coords: 18 - box_coord_offset: 0 - keypoint_coord_offset: 4 - num_keypoints: 7 - num_values_per_keypoint: 2 - sigmoid_score: true - score_clipping_thresh: 100.0 - reverse_output_order: true - - x_scale: 256.0 - y_scale: 256.0 - h_scale: 256.0 - w_scale: 256.0 - min_score_thresh: 0.7 - } - } -} - -# Performs non-max suppression to remove excessive detections. -node { - calculator: "NonMaxSuppressionCalculator" - input_stream: "detections" - output_stream: "filtered_detections" - node_options: { - [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { - min_suppression_threshold: 0.3 - overlap_type: INTERSECTION_OVER_UNION - algorithm: WEIGHTED - return_empty_detections: true - } - } -} - -# Maps detection label IDs to the corresponding label text ("Palm"). The label -# map is provided in the label_map_path option. -node { - calculator: "DetectionLabelIdToTextCalculator" - input_stream: "filtered_detections" - output_stream: "labeled_detections" - node_options: { - [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { - label_map_path: "mediapipe/models/palm_detection_labelmap.txt" - } - } -} - -# Adjusts detection locations (already normalized to [0.f, 1.f]) on the -# letterboxed image (after image transformation with the FIT scale mode) to the -# corresponding locations on the same image with the letterbox removed (the -# input image to the graph before image transformation). -node { - calculator: "DetectionLetterboxRemovalCalculator" - input_stream: "DETECTIONS:labeled_detections" - input_stream: "LETTERBOX_PADDING:letterbox_padding" - output_stream: "DETECTIONS:palm_detections" -} - -# Extracts image size from the input images. -node { - calculator: "ImagePropertiesCalculator" - input_stream: "IMAGE_GPU:input_video" - output_stream: "SIZE:image_size" -} - -# Converts each palm detection into a rectangle (normalized by image size) -# that encloses the palm and is rotated such that the line connecting center of -# the wrist and MCP of the middle finger is aligned with the Y-axis of the -# rectangle. -node { - calculator: "DetectionsToRectsCalculator" - input_stream: "DETECTIONS:palm_detections" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "NORM_RECTS:palm_rects" - node_options: { - [type.googleapis.com/mediapipe.DetectionsToRectsCalculatorOptions] { - rotation_vector_start_keypoint_index: 0 # Center of wrist. - rotation_vector_end_keypoint_index: 2 # MCP of middle finger. - rotation_vector_target_angle_degrees: 90 - output_zero_rect_for_empty_detections: true - } - } -} - -# Expands and shifts the rectangle that contains the palm so that it's likely -# to cover the entire hand. -node { - calculator: "RectTransformationCalculator" - input_stream: "NORM_RECTS:palm_rects" - input_stream: "IMAGE_SIZE:image_size" - output_stream: "hand_rects_from_palm_detections" - node_options: { - [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { - scale_x: 2.6 - scale_y: 2.6 - shift_y: -0.5 - square_long: true - } - } -} - -# Clips the size of the input vector to the provided max_vec_size. This -# determines the maximum number of hand instances this graph outputs. 
-# Note that the performance gain of clipping detections earlier in this graph is -# minimal because NMS will minimize overlapping detections and the number of -# detections isn't expected to exceed 5-10. -node { - calculator: "ClipNormalizedRectVectorSizeCalculator" - input_stream: "hand_rects_from_palm_detections" - output_stream: "clipped_hand_rects_from_palm_detections" - node_options: { - [type.googleapis.com/mediapipe.ClipVectorSizeCalculatorOptions] { - # This value can be changed to support tracking arbitrary number of hands. - # Please also remember to modify min_size in - # CollectionHsMinSizeCalculatorOptions in - # mediapipe/graphs/hand_tracking/multi_hand_tracking_mobile.pbtxt and - # mediapipe/graphs/hand_tracking/multi_hand_tracking_desktop_live.pbtxt. - max_vec_size: 2 - } - } -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_landmark.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_landmark.pbtxt deleted file mode 100644 index 08b283a80e..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/multi_hand_landmark.pbtxt +++ /dev/null @@ -1,84 +0,0 @@ -# MediaPipe hand landmark localization subgraph. - -type: "MultiHandLandmarkSubgraph" - -input_stream: "IMAGE:input_video" -# A vector of NormalizedRect, one per each hand detected. -input_stream: "NORM_RECTS:multi_hand_rects" -# A vector of NormalizedLandmarks, one set per each hand. -output_stream: "LANDMARKS:filtered_multi_hand_landmarks" -# A vector of NormalizedRect, one per each hand. -output_stream: "NORM_RECTS:filtered_multi_hand_rects_for_next_frame" - -# Outputs each element of multi_hand_rects at a fake timestamp for the rest -# of the graph to process. Clones the input_video packet for each -# single_hand_rect at the fake timestamp. At the end of the loop, -# outputs the BATCH_END timestamp for downstream calculators to inform them -# that all elements in the vector have been processed. -node { - calculator: "BeginLoopNormalizedRectCalculator" - input_stream: "ITERABLE:multi_hand_rects" - input_stream: "CLONE:input_video" - output_stream: "ITEM:single_hand_rect" - output_stream: "CLONE:input_video_cloned" - output_stream: "BATCH_END:single_hand_rect_timestamp" -} - -node { - calculator: "HandLandmarkSubgraph" - input_stream: "IMAGE:input_video_cloned" - input_stream: "NORM_RECT:single_hand_rect" - output_stream: "LANDMARKS:single_hand_landmarks" - output_stream: "NORM_RECT:single_hand_rect_from_landmarks" - output_stream: "PRESENCE:single_hand_presence" -} - -# Collects the boolean presence value for each single hand into a vector. Upon -# receiving the BATCH_END timestamp, outputs a vector of boolean values at the -# BATCH_END timestamp. -node { - calculator: "EndLoopBooleanCalculator" - input_stream: "ITEM:single_hand_presence" - input_stream: "BATCH_END:single_hand_rect_timestamp" - output_stream: "ITERABLE:multi_hand_presence" -} - -# Collects a set of landmarks for each hand into a vector. Upon receiving the -# BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END -# timestamp. -node { - calculator: "EndLoopNormalizedLandmarkListVectorCalculator" - input_stream: "ITEM:single_hand_landmarks" - input_stream: "BATCH_END:single_hand_rect_timestamp" - output_stream: "ITERABLE:multi_hand_landmarks" -} - -# Collects a NormalizedRect for each hand into a vector. Upon receiving the -# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END -# timestamp. 
-node { - calculator: "EndLoopNormalizedRectCalculator" - input_stream: "ITEM:single_hand_rect_from_landmarks" - input_stream: "BATCH_END:single_hand_rect_timestamp" - output_stream: "ITERABLE:multi_hand_rects_for_next_frame" -} - -# Filters the input vector of landmarks based on hand presence value for each -# hand. If the hand presence for hand #i is false, the set of landmarks -# corresponding to that hand are dropped from the vector. -node { - calculator: "FilterLandmarkListCollectionCalculator" - input_stream: "ITERABLE:multi_hand_landmarks" - input_stream: "CONDITION:multi_hand_presence" - output_stream: "ITERABLE:filtered_multi_hand_landmarks" -} - -# Filters the input vector of NormalizedRect based on hand presence value for -# each hand. If the hand presence for hand #i is false, the NormalizedRect -# corresponding to that hand are dropped from the vector. -node { - calculator: "FilterNormalizedRectCollectionCalculator" - input_stream: "ITERABLE:multi_hand_rects_for_next_frame" - input_stream: "CONDITION:multi_hand_presence" - output_stream: "ITERABLE:filtered_multi_hand_rects_for_next_frame" -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/renderer_cpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/renderer_cpu.pbtxt deleted file mode 100644 index 0b9c4dc2ca..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/renderer_cpu.pbtxt +++ /dev/null @@ -1,123 +0,0 @@ -# MediaPipe hand tracking rendering subgraph. - -type: "RendererSubgraph" - -input_stream: "IMAGE:input_image" -input_stream: "DETECTIONS:detections" -input_stream: "LANDMARKS:landmarks" -input_stream: "NORM_RECT:rect" -input_stream: "HANDEDNESS:handedness" -output_stream: "IMAGE:output_image" - -# Converts classification to drawing primitives for annotation overlay. -node { - calculator: "LabelsToRenderDataCalculator" - input_stream: "CLASSIFICATIONS:handedness" - output_stream: "RENDER_DATA:handedness_render_data" - node_options: { - [type.googleapis.com/mediapipe.LabelsToRenderDataCalculatorOptions]: { - color { r: 255 g: 0 b: 0 } - thickness: 10.0 - font_height_px: 50 - horizontal_offset_px: 100 - vertical_offset_px: 100 - - max_num_labels: 1 - location: TOP_LEFT - } - } -} - -# Converts detections to drawing primitives for annotation overlay. -node { - calculator: "DetectionsToRenderDataCalculator" - input_stream: "DETECTIONS:detections" - output_stream: "RENDER_DATA:detection_render_data" - node_options: { - [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { - thickness: 4.0 - color { r: 0 g: 255 b: 0 } - } - } -} - -# Converts landmarks to drawing primitives for annotation overlay. 
-node { - calculator: "LandmarksToRenderDataCalculator" - input_stream: "NORM_LANDMARKS:landmarks" - output_stream: "RENDER_DATA:landmark_render_data" - node_options: { - [type.googleapis.com/mediapipe.LandmarksToRenderDataCalculatorOptions] { - landmark_connections: 0 - landmark_connections: 1 - landmark_connections: 1 - landmark_connections: 2 - landmark_connections: 2 - landmark_connections: 3 - landmark_connections: 3 - landmark_connections: 4 - landmark_connections: 0 - landmark_connections: 5 - landmark_connections: 5 - landmark_connections: 6 - landmark_connections: 6 - landmark_connections: 7 - landmark_connections: 7 - landmark_connections: 8 - landmark_connections: 5 - landmark_connections: 9 - landmark_connections: 9 - landmark_connections: 10 - landmark_connections: 10 - landmark_connections: 11 - landmark_connections: 11 - landmark_connections: 12 - landmark_connections: 9 - landmark_connections: 13 - landmark_connections: 13 - landmark_connections: 14 - landmark_connections: 14 - landmark_connections: 15 - landmark_connections: 15 - landmark_connections: 16 - landmark_connections: 13 - landmark_connections: 17 - landmark_connections: 0 - landmark_connections: 17 - landmark_connections: 17 - landmark_connections: 18 - landmark_connections: 18 - landmark_connections: 19 - landmark_connections: 19 - landmark_connections: 20 - landmark_color { r: 255 g: 0 b: 0 } - connection_color { r: 0 g: 255 b: 0 } - thickness: 4.0 - } - } -} - -# Converts normalized rects to drawing primitives for annotation overlay. -node { - calculator: "RectToRenderDataCalculator" - input_stream: "NORM_RECT:rect" - output_stream: "RENDER_DATA:rect_render_data" - node_options: { - [type.googleapis.com/mediapipe.RectToRenderDataCalculatorOptions] { - filled: false - color { r: 255 g: 0 b: 0 } - thickness: 4.0 - } - } -} - -# Draws annotations and overlays them on top of the input images. -node { - calculator: "AnnotationOverlayCalculator" - input_stream: "IMAGE:input_image" - input_stream: "detection_render_data" - input_stream: "landmark_render_data" - input_stream: "handedness_render_data" - input_stream: "rect_render_data" - output_stream: "IMAGE:output_image" -} diff --git a/mediapipe/graphs/hand_tracking/subgraphs/renderer_gpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/renderer_gpu.pbtxt deleted file mode 100644 index 06628935ab..0000000000 --- a/mediapipe/graphs/hand_tracking/subgraphs/renderer_gpu.pbtxt +++ /dev/null @@ -1,123 +0,0 @@ -# MediaPipe hand tracking rendering subgraph. - -type: "RendererSubgraph" - -input_stream: "IMAGE:input_image" -input_stream: "DETECTIONS:detections" -input_stream: "LANDMARKS:landmarks" -input_stream: "NORM_RECT:rect" -input_stream: "HANDEDNESS:handedness" -output_stream: "IMAGE:output_image" - -# Converts classification to drawing primitives for annotation overlay. -node { - calculator: "LabelsToRenderDataCalculator" - input_stream: "CLASSIFICATIONS:handedness" - output_stream: "RENDER_DATA:handedness_render_data" - node_options: { - [type.googleapis.com/mediapipe.LabelsToRenderDataCalculatorOptions]: { - color { r: 255 g: 0 b: 0 } - thickness: 10.0 - font_height_px: 50 - horizontal_offset_px: 200 - vertical_offset_px: 100 - - max_num_labels: 1 - location: TOP_LEFT - } - } -} - -# Converts detections to drawing primitives for annotation overlay. 
-node { - calculator: "DetectionsToRenderDataCalculator" - input_stream: "DETECTIONS:detections" - output_stream: "RENDER_DATA:detection_render_data" - node_options: { - [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { - thickness: 4.0 - color { r: 0 g: 255 b: 0 } - } - } -} - -# Converts landmarks to drawing primitives for annotation overlay. -node { - calculator: "LandmarksToRenderDataCalculator" - input_stream: "NORM_LANDMARKS:landmarks" - output_stream: "RENDER_DATA:landmark_render_data" - node_options: { - [type.googleapis.com/mediapipe.LandmarksToRenderDataCalculatorOptions] { - landmark_connections: 0 - landmark_connections: 1 - landmark_connections: 1 - landmark_connections: 2 - landmark_connections: 2 - landmark_connections: 3 - landmark_connections: 3 - landmark_connections: 4 - landmark_connections: 0 - landmark_connections: 5 - landmark_connections: 5 - landmark_connections: 6 - landmark_connections: 6 - landmark_connections: 7 - landmark_connections: 7 - landmark_connections: 8 - landmark_connections: 5 - landmark_connections: 9 - landmark_connections: 9 - landmark_connections: 10 - landmark_connections: 10 - landmark_connections: 11 - landmark_connections: 11 - landmark_connections: 12 - landmark_connections: 9 - landmark_connections: 13 - landmark_connections: 13 - landmark_connections: 14 - landmark_connections: 14 - landmark_connections: 15 - landmark_connections: 15 - landmark_connections: 16 - landmark_connections: 13 - landmark_connections: 17 - landmark_connections: 0 - landmark_connections: 17 - landmark_connections: 17 - landmark_connections: 18 - landmark_connections: 18 - landmark_connections: 19 - landmark_connections: 19 - landmark_connections: 20 - landmark_color { r: 255 g: 0 b: 0 } - connection_color { r: 0 g: 255 b: 0 } - thickness: 4.0 - } - } -} - -# Converts normalized rects to drawing primitives for annotation overlay. -node { - calculator: "RectToRenderDataCalculator" - input_stream: "NORM_RECT:rect" - output_stream: "RENDER_DATA:rect_render_data" - node_options: { - [type.googleapis.com/mediapipe.RectToRenderDataCalculatorOptions] { - filled: false - color { r: 255 g: 0 b: 0 } - thickness: 4.0 - } - } -} - -# Draws annotations and overlays them on top of the input images. -node { - calculator: "AnnotationOverlayCalculator" - input_stream: "IMAGE_GPU:input_image" - input_stream: "detection_render_data" - input_stream: "landmark_render_data" - input_stream: "handedness_render_data" - input_stream: "rect_render_data" - output_stream: "IMAGE_GPU:output_image" -} diff --git a/mediapipe/graphs/object_detection_3d/BUILD b/mediapipe/graphs/object_detection_3d/BUILD index 846aa6739f..c0704213b1 100644 --- a/mediapipe/graphs/object_detection_3d/BUILD +++ b/mediapipe/graphs/object_detection_3d/BUILD @@ -1,4 +1,4 @@ -# Copyright 2019 The MediaPipe Authors. +# Copyright 2020 The MediaPipe Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -28,6 +28,23 @@ exports_files(glob([ cc_library( name = "mobile_calculators", visibility = ["//visibility:public"], + deps = [ + "//mediapipe/calculators/core:constant_side_packet_calculator", + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:merge_calculator", + "//mediapipe/calculators/core:previous_loopback_calculator", + "//mediapipe/calculators/image:image_cropping_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:annotations_to_model_matrices_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:gl_animation_overlay_calculator", + "//mediapipe/graphs/object_detection_3d/subgraphs:box_landmark_gpu", + "//mediapipe/graphs/object_detection_3d/subgraphs:object_detection_oid_v4_gpu", + ], +) + +cc_library( + name = "mobile_calculators_1stage", + visibility = ["//visibility:public"], deps = [ "//mediapipe/calculators/core:packet_resampler_calculator", "//mediapipe/calculators/image:image_cropping_calculator", @@ -40,17 +57,17 @@ cc_library( ) mediapipe_binary_graph( - name = "mobile_gpu_binary_graph_shoe", - graph = "shoe_classic_occlusion_tracking.pbtxt", - output_name = "mobile_gpu_shoe.binarypb", + name = "mobile_gpu_binary_graph", + graph = "object_occlusion_tracking.pbtxt", + output_name = "mobile_gpu_binary_graph.binarypb", visibility = ["//visibility:public"], deps = [":mobile_calculators"], ) mediapipe_binary_graph( - name = "mobile_gpu_binary_graph_chair", - graph = "chair_classic_occlusion_tracking.pbtxt", - output_name = "mobile_gpu_chair.binarypb", + name = "mobile_gpu_1stage_binary_graph", + graph = "object_occlusion_tracking_1stage.pbtxt", + output_name = "mobile_gpu_1stage_binary_graph.binarypb", visibility = ["//visibility:public"], - deps = [":mobile_calculators"], + deps = [":mobile_calculators_1stage"], ) diff --git a/mediapipe/graphs/object_detection_3d/calculators/BUILD b/mediapipe/graphs/object_detection_3d/calculators/BUILD index b126564f2e..d03174ff20 100644 --- a/mediapipe/graphs/object_detection_3d/calculators/BUILD +++ b/mediapipe/graphs/object_detection_3d/calculators/BUILD @@ -12,67 +12,65 @@ # See the License for the specific language governing permissions and # limitations under the License. 
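The renamed mediapipe_binary_graph targets above serialize object_occlusion_tracking.pbtxt and object_occlusion_tracking_1stage.pbtxt into mobile_gpu_binary_graph.binarypb and mobile_gpu_1stage_binary_graph.binarypb. As a rough, hedged sketch (not part of this change), an application would typically read such a .binarypb into a CalculatorGraphConfig and initialize a CalculatorGraph with it; the file path below is illustrative, the Android demos load the same bytes through the AAR/JNI layer, and GPU resource setup is omitted.

// Hedged sketch: loading a compiled .binarypb produced by the
// mediapipe_binary_graph targets above into a CalculatorGraph.
#include <fstream>
#include <sstream>
#include <string>

#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"

::mediapipe::Status InitGraphFromBinaryPb(const std::string& path,
                                          mediapipe::CalculatorGraph* graph) {
  std::ifstream stream(path, std::ios::binary);
  RET_CHECK(stream.is_open()) << "Cannot open " << path;
  std::stringstream buffer;
  buffer << stream.rdbuf();
  mediapipe::CalculatorGraphConfig config;
  RET_CHECK(config.ParseFromString(buffer.str()))
      << "Cannot parse CalculatorGraphConfig from " << path;
  return graph->Initialize(config);
}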
-load("//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library") +load("//mediapipe/framework/port:build_config.bzl", "mediapipe_proto_library") licenses(["notice"]) package(default_visibility = ["//visibility:public"]) -proto_library( +mediapipe_proto_library( name = "object_proto", - srcs = [ - "object.proto", - ], + srcs = ["object.proto"], + visibility = ["//visibility:public"], ) -proto_library( +mediapipe_proto_library( name = "a_r_capture_metadata_proto", - srcs = [ - "a_r_capture_metadata.proto", - ], + srcs = ["a_r_capture_metadata.proto"], + visibility = ["//visibility:public"], ) -proto_library( +mediapipe_proto_library( name = "annotation_proto", - srcs = [ - "annotation_data.proto", - ], + srcs = ["annotation_data.proto"], + visibility = ["//visibility:public"], deps = [ ":a_r_capture_metadata_proto", ":object_proto", ], ) -proto_library( - name = "belief_decoder_config_proto", - srcs = [ - "belief_decoder_config.proto", - ], -) - -proto_library( +mediapipe_proto_library( name = "camera_parameters_proto", - srcs = [ - "camera_parameters.proto", - ], + srcs = ["camera_parameters.proto"], + visibility = ["//visibility:public"], ) -proto_library( +mediapipe_proto_library( name = "frame_annotation_tracker_calculator_proto", srcs = ["frame_annotation_tracker_calculator.proto"], + visibility = ["//visibility:public"], deps = [ "//mediapipe/framework:calculator_proto", ], ) -proto_library( +mediapipe_proto_library( name = "gl_animation_overlay_calculator_proto", srcs = ["gl_animation_overlay_calculator.proto"], visibility = ["//visibility:public"], - deps = ["//mediapipe/framework:calculator_proto"], + deps = [ + "//mediapipe/framework:calculator_proto", + ], +) + +mediapipe_proto_library( + name = "belief_decoder_config_proto", + srcs = ["belief_decoder_config.proto"], + visibility = ["//visibility:public"], ) -proto_library( +mediapipe_proto_library( name = "tflite_tensors_to_objects_calculator_proto", srcs = ["tflite_tensors_to_objects_calculator.proto"], visibility = ["//visibility:public"], @@ -82,7 +80,7 @@ proto_library( ], ) -proto_library( +mediapipe_proto_library( name = "lift_2d_frame_annotation_to_3d_calculator_proto", srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"], visibility = ["//visibility:public"], @@ -92,7 +90,7 @@ proto_library( ], ) -proto_library( +mediapipe_proto_library( name = "annotations_to_model_matrices_calculator_proto", srcs = ["annotations_to_model_matrices_calculator.proto"], visibility = ["//visibility:public"], @@ -101,7 +99,7 @@ proto_library( ], ) -proto_library( +mediapipe_proto_library( name = "model_matrix_proto", srcs = ["model_matrix.proto"], visibility = ["//visibility:public"], @@ -110,7 +108,7 @@ proto_library( ], ) -proto_library( +mediapipe_proto_library( name = "annotations_to_render_data_calculator_proto", srcs = ["annotations_to_render_data_calculator.proto"], visibility = ["//visibility:public"], @@ -120,112 +118,22 @@ proto_library( ], ) -mediapipe_cc_proto_library( - name = "object_cc_proto", - srcs = ["object.proto"], - visibility = ["//visibility:public"], - deps = [":object_proto"], -) - -mediapipe_cc_proto_library( - name = "a_r_capture_metadata_cc_proto", - srcs = ["a_r_capture_metadata.proto"], - visibility = ["//visibility:public"], - deps = [":a_r_capture_metadata_proto"], -) - -mediapipe_cc_proto_library( - name = "annotation_cc_proto", - srcs = ["annotation_data.proto"], - cc_deps = [ - ":a_r_capture_metadata_cc_proto", - ":object_cc_proto", - ], - visibility = ["//visibility:public"], - deps 
= [":annotation_proto"], -) - -mediapipe_cc_proto_library( - name = "camera_parameters_cc_proto", - srcs = ["camera_parameters.proto"], +mediapipe_proto_library( + name = "frame_annotation_to_rect_calculator_proto", + srcs = ["frame_annotation_to_rect_calculator.proto"], visibility = ["//visibility:public"], - deps = [":camera_parameters_proto"], -) - -mediapipe_cc_proto_library( - name = "frame_annotation_tracker_calculator_cc_proto", - srcs = ["frame_annotation_tracker_calculator.proto"], - cc_deps = [ - "//mediapipe/framework:calculator_cc_proto", - ], - visibility = ["//visibility:public"], - deps = [":frame_annotation_tracker_calculator_proto"], -) - -mediapipe_cc_proto_library( - name = "gl_animation_overlay_calculator_cc_proto", - srcs = ["gl_animation_overlay_calculator.proto"], - cc_deps = [ - "//mediapipe/framework:calculator_cc_proto", - ], - visibility = ["//visibility:public"], - deps = [":gl_animation_overlay_calculator_proto"], -) - -mediapipe_cc_proto_library( - name = "belief_decoder_config_cc_proto", - srcs = ["belief_decoder_config.proto"], - visibility = ["//visibility:public"], - deps = [":belief_decoder_config_proto"], -) - -mediapipe_cc_proto_library( - name = "tflite_tensors_to_objects_calculator_cc_proto", - srcs = ["tflite_tensors_to_objects_calculator.proto"], - cc_deps = [ - ":belief_decoder_config_cc_proto", - "//mediapipe/framework:calculator_cc_proto", - ], - visibility = ["//visibility:public"], - deps = [":tflite_tensors_to_objects_calculator_proto"], -) - -mediapipe_cc_proto_library( - name = "lift_2d_frame_annotation_to_3d_calculator_cc_proto", - srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"], - cc_deps = [ - ":belief_decoder_config_cc_proto", - "//mediapipe/framework:calculator_cc_proto", + deps = [ + "//mediapipe/framework:calculator_proto", ], - visibility = ["//visibility:public"], - deps = [":lift_2d_frame_annotation_to_3d_calculator_proto"], -) - -mediapipe_cc_proto_library( - name = "annotations_to_model_matrices_calculator_cc_proto", - srcs = ["annotations_to_model_matrices_calculator.proto"], - cc_deps = ["//mediapipe/framework:calculator_cc_proto"], - visibility = ["//visibility:public"], - deps = [":annotations_to_model_matrices_calculator_proto"], ) -mediapipe_cc_proto_library( - name = "model_matrix_cc_proto", - srcs = ["model_matrix.proto"], - cc_deps = ["//mediapipe/framework:calculator_cc_proto"], +mediapipe_proto_library( + name = "filter_detection_calculator_proto", + srcs = ["filter_detection_calculator.proto"], visibility = ["//visibility:public"], - deps = [":model_matrix_proto"], -) - -mediapipe_cc_proto_library( - name = "annotations_to_render_data_calculator_cc_proto", - srcs = ["annotations_to_render_data_calculator.proto"], - cc_deps = [ - "//mediapipe/framework:calculator_cc_proto", - "//mediapipe/util:color_cc_proto", + deps = [ + "//mediapipe/framework:calculator_proto", ], - visibility = ["//visibility:public"], - deps = [":annotations_to_render_data_calculator_proto"], ) cc_library( @@ -452,6 +360,55 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "frame_annotation_to_rect_calculator", + srcs = ["frame_annotation_to_rect_calculator.cc"], + deps = [ + ":annotation_cc_proto", + ":box", + ":frame_annotation_to_rect_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "@com_google_absl//absl/memory", + "@eigen_archive//:eigen", + ], + alwayslink = 1, +) + 
+cc_library( + name = "landmarks_to_frame_annotation_calculator", + srcs = ["landmarks_to_frame_annotation_calculator.cc"], + deps = [ + ":annotation_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:landmark_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "@com_google_absl//absl/memory", + ], + alwayslink = 1, +) + +cc_library( + name = "filter_detection_calculator", + srcs = ["filter_detection_calculator.cc"], + deps = [ + ":filter_detection_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/formats:location_data_cc_proto", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:map_util", + "//mediapipe/framework/port:re2", + "//mediapipe/framework/port:status", + "@com_google_absl//absl/container:node_hash_set", + "@com_google_absl//absl/strings", + ], + alwayslink = 1, +) + cc_test( name = "box_util_test", srcs = ["box_util_test.cc"], diff --git a/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.cc index 220869945c..0072b51a1c 100644 --- a/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.cc +++ b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.cc @@ -93,6 +93,14 @@ ::mediapipe::Status AnnotationsToModelMatricesCalculator::GetContract( if (cc->Outputs().HasTag(kModelMatricesTag)) { cc->Outputs().Tag(kModelMatricesTag).Set(); } + + if (cc->InputSidePackets().HasTag("MODEL_SCALE")) { + cc->InputSidePackets().Tag("MODEL_SCALE").Set(); + } + + if (cc->InputSidePackets().HasTag("MODEL_TRANSFORMATION")) { + cc->InputSidePackets().Tag("MODEL_TRANSFORMATION").Set(); + } return ::mediapipe::OkStatus(); } @@ -103,14 +111,20 @@ ::mediapipe::Status AnnotationsToModelMatricesCalculator::Open( cc->SetOffset(TimestampDiff(0)); options_ = cc->Options(); - if (options_.model_scale_size() == 3) { + if (cc->InputSidePackets().HasTag("MODEL_SCALE")) { + model_scale_ = Eigen::Map( + cc->InputSidePackets().Tag("MODEL_SCALE").Get()); + } else if (options_.model_scale_size() == 3) { model_scale_ = Eigen::Map(options_.model_scale().data()); } else { model_scale_.setOnes(); } - if (options_.model_transformation_size() == 16) { + if (cc->InputSidePackets().HasTag("MODEL_TRANSFORMATION")) { + model_transformation_ = Eigen::Map( + cc->InputSidePackets().Tag("MODEL_TRANSFORMATION").Get()); + } else if (options_.model_transformation_size() == 16) { model_transformation_ = Eigen::Map(options_.model_transformation().data()); } else { diff --git a/mediapipe/graphs/object_detection_3d/calculators/filter_detection_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/filter_detection_calculator.cc new file mode 100644 index 0000000000..293cc72ccc --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/filter_detection_calculator.cc @@ -0,0 +1,265 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "absl/container/node_hash_set.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/strings/strip.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/formats/location_data.pb.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/map_util.h" +#include "mediapipe/framework/port/re2.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/graphs/object_detection_3d/calculators/filter_detection_calculator.pb.h" + +namespace mediapipe { + +namespace { + +constexpr char kDetectionTag[] = "DETECTION"; +constexpr char kDetectionsTag[] = "DETECTIONS"; +constexpr char kLabelsTag[] = "LABELS"; +constexpr char kLabelsCsvTag[] = "LABELS_CSV"; + +using ::mediapipe::ContainsKey; +using ::mediapipe::RE2; +using Detections = std::vector; +using Strings = std::vector; + +} // namespace + +// Filters the entries in a Detection to only those with valid scores +// for the specified allowed labels. Allowed labels are provided as a +// vector in an optional input side packet. Allowed labels can +// contain simple strings or regular expressions. The valid score range +// can be set in the options.The allowed labels can be provided as +// vector (LABELS) or CSV std::string (LABELS_CSV) containing class +// names of allowed labels. Note: Providing an empty vector in the input side +// packet Packet causes this calculator to act as a sink if +// empty_allowed_labels_means_allow_everything is set to false (default value). +// To allow all labels, use the calculator with no input side packet stream, or +// set empty_allowed_labels_means_allow_everything to true. +// +// Example config: +// node { +// calculator: "FilterDetectionCalculator" +// input_stream: "DETECTIONS:detections" +// output_stream: "DETECTIONS:filtered_detections" +// input_side_packet: "LABELS:allowed_labels" +// node_options: { +// [type.googleapis.com/mediapipe.FilterDetectionCalculatorOptions]: { +// min_score: 0.5 +// } +// } +// } + +struct FirstGreaterComparator { + bool operator()(const std::pair& a, + const std::pair& b) const { + return a.first > b.first; + } +}; + +::mediapipe::Status SortLabelsByDecreasingScore(const Detection& detection, + Detection* sorted_detection) { + RET_CHECK(sorted_detection); + RET_CHECK_EQ(detection.score_size(), detection.label_size()); + if (!detection.label_id().empty()) { + RET_CHECK_EQ(detection.score_size(), detection.label_id_size()); + } + // Copies input to keep all fields unchanged, and to reserve space for + // repeated fields. Repeated fields (score, label, and label_id) will be + // overwritten. 
+ *sorted_detection = detection; + + std::vector> scores_and_indices(detection.score_size()); + for (int i = 0; i < detection.score_size(); ++i) { + scores_and_indices[i].first = detection.score(i); + scores_and_indices[i].second = i; + } + + std::sort(scores_and_indices.begin(), scores_and_indices.end(), + FirstGreaterComparator()); + + for (int i = 0; i < detection.score_size(); ++i) { + const int index = scores_and_indices[i].second; + sorted_detection->set_score(i, detection.score(index)); + sorted_detection->set_label(i, detection.label(index)); + } + + if (!detection.label_id().empty()) { + for (int i = 0; i < detection.score_size(); ++i) { + const int index = scores_and_indices[i].second; + sorted_detection->set_label_id(i, detection.label_id(index)); + } + } + return ::mediapipe::OkStatus(); +} + +class FilterDetectionCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + bool IsValidLabel(const std::string& label); + bool IsValidScore(float score); + // Stores numeric limits for filtering on the score. + FilterDetectionCalculatorOptions options_; + // We use the next two fields to possibly filter to a limited set of + // classes. The hash_set will be empty in two cases: 1) if no input + // side packet stream is provided (not filtering on labels), or 2) + // if the input side packet contains an empty vector (no labels are + // allowed). We use limit_labels_ to distinguish between the two cases. + bool limit_labels_ = true; + absl::node_hash_set allowed_labels_; +}; +REGISTER_CALCULATOR(FilterDetectionCalculator); + +::mediapipe::Status FilterDetectionCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag(kDetectionTag)) { + cc->Inputs().Tag(kDetectionTag).Set(); + cc->Outputs().Tag(kDetectionTag).Set(); + } + if (cc->Inputs().HasTag(kDetectionsTag)) { + cc->Inputs().Tag(kDetectionsTag).Set(); + cc->Outputs().Tag(kDetectionsTag).Set(); + } + if (cc->InputSidePackets().HasTag(kLabelsTag)) { + cc->InputSidePackets().Tag(kLabelsTag).Set(); + } + if (cc->InputSidePackets().HasTag(kLabelsCsvTag)) { + cc->InputSidePackets().Tag(kLabelsCsvTag).Set(); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FilterDetectionCalculator::Open(CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + options_ = cc->Options(); + limit_labels_ = cc->InputSidePackets().HasTag(kLabelsTag) || + cc->InputSidePackets().HasTag(kLabelsCsvTag); + if (limit_labels_) { + Strings whitelist_labels; + if (cc->InputSidePackets().HasTag(kLabelsCsvTag)) { + whitelist_labels = absl::StrSplit( + cc->InputSidePackets().Tag(kLabelsCsvTag).Get(), ',', + absl::SkipWhitespace()); + for (auto& e : whitelist_labels) { + absl::StripAsciiWhitespace(&e); + } + } else { + whitelist_labels = cc->InputSidePackets().Tag(kLabelsTag).Get(); + } + allowed_labels_.insert(whitelist_labels.begin(), whitelist_labels.end()); + } + if (limit_labels_ && allowed_labels_.empty()) { + if (options_.fail_on_empty_labels()) { + cc->GetCounter("VideosWithEmptyLabelsWhitelist")->Increment(); + return tool::StatusFail( + "FilterDetectionCalculator received empty whitelist with " + "fail_on_empty_labels = true."); + } + if (options_.empty_allowed_labels_means_allow_everything()) { + // Continue as if side_input was not provided, 
i.e. pass all labels. + limit_labels_ = false; + } + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FilterDetectionCalculator::Process(CalculatorContext* cc) { + if (limit_labels_ && allowed_labels_.empty()) { + return ::mediapipe::OkStatus(); + } + Detections detections; + if (cc->Inputs().HasTag(kDetectionsTag)) { + detections = cc->Inputs().Tag(kDetectionsTag).Get(); + } else if (cc->Inputs().HasTag(kDetectionTag)) { + detections.emplace_back(cc->Inputs().Tag(kDetectionsTag).Get()); + } + std::unique_ptr outputs(new Detections); + for (const auto& input : detections) { + Detection output; + for (int i = 0; i < input.label_size(); ++i) { + const std::string& label = input.label(i); + const float score = input.score(i); + if (IsValidLabel(label) && IsValidScore(score)) { + output.add_label(label); + output.add_score(score); + } + } + if (output.label_size() > 0) { + if (input.has_location_data()) { + *output.mutable_location_data() = input.location_data(); + } + Detection output_sorted; + if (!SortLabelsByDecreasingScore(output, &output_sorted).ok()) { + // Uses the orginal output if fails to sort. + cc->GetCounter("FailedToSortLabelsInDetection")->Increment(); + output_sorted = output; + } + outputs->emplace_back(output_sorted); + } + } + + if (cc->Outputs().HasTag(kDetectionsTag)) { + cc->Outputs() + .Tag(kDetectionsTag) + .Add(outputs.release(), cc->InputTimestamp()); + } else if (!outputs->empty()) { + cc->Outputs() + .Tag(kDetectionsTag) + .Add(new Detection((*outputs)[0]), cc->InputTimestamp()); + } + return ::mediapipe::OkStatus(); +} + +bool FilterDetectionCalculator::IsValidLabel(const std::string& label) { + bool match = !limit_labels_ || ContainsKey(allowed_labels_, label); + if (!match) { + // If no exact match is found, check for regular expression + // comparions in the allowed_labels. + for (const auto& label_regexp : allowed_labels_) { + match = match || RE2::FullMatch(label, RE2(label_regexp)); + } + } + return match; +} + +bool FilterDetectionCalculator::IsValidScore(float score) { + if (options_.has_min_score() && score < options_.min_score()) { + LOG(ERROR) << "Filter out detection with low score " << score; + return false; + } + if (options_.has_max_score() && score > options_.max_score()) { + LOG(ERROR) << "Filter out detection with high score " << score; + return false; + } + LOG(ERROR) << "Pass detection with score " << score; + return true; +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/filter_detection_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/filter_detection_calculator.proto new file mode 100644 index 0000000000..ea79b8d4bf --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/filter_detection_calculator.proto @@ -0,0 +1,45 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
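The FilterDetectionCalculator above accepts its allowed labels either as a vector side packet (LABELS) or as a comma-separated string (LABELS_CSV); the Objectron subgraph later in this change wires the latter to the "allowed_labels" side packet. A minimal sketch of supplying that CSV value when starting a graph run follows; the label strings "Footwear,Chair" are assumed examples of OIDv4 class names, not values taken from this change.

// Hedged sketch: feeding the comma-separated allowed labels consumed by
// FilterDetectionCalculator via its LABELS_CSV input side packet. The
// side-packet name "allowed_labels" matches the graphs below; the label
// strings are assumed examples.
#include <map>
#include <string>

#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/status.h"

::mediapipe::Status StartRunWithAllowedLabels(
    mediapipe::CalculatorGraph* graph) {
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets["allowed_labels"] =
      mediapipe::MakePacket<std::string>("Footwear,Chair");
  return graph->StartRun(side_packets);
}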
+ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message FilterDetectionCalculatorOptions { + extend CalculatorOptions { + optional FilterDetectionCalculatorOptions ext = 339582987; + } + optional float min_score = 1; + optional float max_score = 2; + // Setting fail_on_empty_labels to true will cause the calculator to return a + // failure status on Open() if an empty list is provided on the external + // input, immediately terminating the graph run. + optional bool fail_on_empty_labels = 3 [default = false]; + // If fail_on_empty_labels is set to false setting + // empty_allowed_labels_means_allow_everything to + // false will cause the calculator to close output stream and ignore remaining + // inputs if an empty list is provided. If + // empty_allowed_labels_means_allow_everything is set to true this will force + // calculator to pass all labels. + optional bool empty_allowed_labels_means_allow_everything = 6 + [default = false]; + // Determines whether the input format is a vector (use-case object + // detectors) or Detection (use-case classifiers). + optional bool use_detection_vector = 4 [deprecated = true]; + // Determines whether the input side packet format is a vector of labels, or + // a string with comma separated labels. + optional bool use_allowed_labels_csv = 5 [deprecated = true]; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_rect_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_rect_calculator.cc new file mode 100644 index 0000000000..0d6e9537a4 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_rect_calculator.cc @@ -0,0 +1,185 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and + +#include + +#include "Eigen/Dense" +#include "absl/memory/memory.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/box.h" +#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_rect_calculator.pb.h" + +namespace mediapipe { + +using Matrix3fRM = Eigen::Matrix; +using Eigen::Vector2f; +using Eigen::Vector3f; + +namespace { + +constexpr char kInputFrameAnnotationTag[] = "FRAME_ANNOTATION"; +constexpr char kOutputNormRectTag[] = "NORM_RECT"; + +} // namespace + +// A calculator that converts FrameAnnotation proto to NormalizedRect. +// The rotation angle of the NormalizedRect is derived from object's 3d pose. +// The angle is calculated such that after rotation the 2d projection of y-axis. +// on the image plane is always vertical. 
+class FrameAnnotationToRectCalculator : public CalculatorBase { + public: + enum ViewStatus { + TOP_VIEW_ON, + TOP_VIEW_OFF, + }; + + static ::mediapipe::Status GetContract(CalculatorContract* cc); + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + void AnnotationToRect(const FrameAnnotation& annotation, + NormalizedRect* rect); + float RotationAngleFromAnnotation(const FrameAnnotation& annotation); + + float RotationAngleFromPose(const Matrix3fRM& rotation, + const Vector3f& translation, const Vector3f& vec); + ViewStatus status_; + float off_threshold_; + float on_threshold_; +}; +REGISTER_CALCULATOR(FrameAnnotationToRectCalculator); + +::mediapipe::Status FrameAnnotationToRectCalculator::Open( + CalculatorContext* cc) { + status_ = TOP_VIEW_OFF; + const auto& options = cc->Options(); + off_threshold_ = options.off_threshold(); + on_threshold_ = options.on_threshold(); + RET_CHECK(off_threshold_ <= on_threshold_); + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationToRectCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag(kInputFrameAnnotationTag)) { + cc->Inputs().Tag(kInputFrameAnnotationTag).Set(); + } + + if (cc->Outputs().HasTag(kOutputNormRectTag)) { + cc->Outputs().Tag(kOutputNormRectTag).Set(); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationToRectCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().Tag(kInputFrameAnnotationTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + auto output_rect = absl::make_unique(); + AnnotationToRect( + cc->Inputs().Tag(kInputFrameAnnotationTag).Get(), + output_rect.get()); + + // Output + cc->Outputs() + .Tag(kOutputNormRectTag) + .Add(output_rect.release(), cc->InputTimestamp()); + return ::mediapipe::OkStatus(); +} + +void FrameAnnotationToRectCalculator::AnnotationToRect( + const FrameAnnotation& annotation, NormalizedRect* rect) { + float x_min = std::numeric_limits::max(); + float x_max = std::numeric_limits::min(); + float y_min = std::numeric_limits::max(); + float y_max = std::numeric_limits::min(); + const auto& object = annotation.annotations(0); + for (const auto& keypoint : object.keypoints()) { + const auto& point_2d = keypoint.point_2d(); + x_min = std::min(x_min, point_2d.x()); + x_max = std::max(x_max, point_2d.x()); + y_min = std::min(y_min, point_2d.y()); + y_max = std::max(y_max, point_2d.y()); + } + rect->set_x_center((x_min + x_max) / 2); + rect->set_y_center((y_min + y_max) / 2); + rect->set_width(x_max - x_min); + rect->set_height(y_max - y_min); + rect->set_rotation(RotationAngleFromAnnotation(annotation)); +} + +float FrameAnnotationToRectCalculator::RotationAngleFromAnnotation( + const FrameAnnotation& annotation) { + const auto& object = annotation.annotations(0); + Box box("category"); + std::vector vertices_3d; + std::vector vertices_2d; + for (const auto& keypoint : object.keypoints()) { + const auto& point_3d = keypoint.point_3d(); + const auto& point_2d = keypoint.point_2d(); + vertices_3d.emplace_back( + Vector3f(point_3d.x(), point_3d.y(), point_3d.z())); + vertices_2d.emplace_back(Vector2f(point_2d.x(), point_2d.y())); + } + box.Fit(vertices_3d); + Vector3f scale = box.GetScale(); + Matrix3fRM box_rotation = box.GetRotation(); + Vector3f box_translation = box.GetTranslation(); + + // Rotation angle to use when top-view is on(top-view 
on), + // Which will make z-axis upright after the rotation. + const float angle_on = + RotationAngleFromPose(box_rotation, box_translation, Vector3f::UnitZ()); + // Rotation angle to use when side-view is on(top-view off), + // Which will make y-axis upright after the rotation. + const float angle_off = + RotationAngleFromPose(box_rotation, box_translation, Vector3f::UnitY()); + + // Calculate angle between z-axis and viewing ray in degrees. + const float view_to_z_angle = std::acos(box_rotation(2, 1)) * 180 / M_PI; + + // Determine threshold based on current status, + // on_threshold_ is used for TOP_VIEW_ON -> TOP_VIEW_OFF transition, + // off_threshold_ is used for TOP_VIEW_OFF -> TOP_VIEW_ON transition. + const float thresh = + (status_ == TOP_VIEW_ON) ? on_threshold_ : off_threshold_; + + // If view_to_z_angle is smaller than threshold, then top-view is on; + // Otherwise top-view is off. + status_ = (view_to_z_angle < thresh) ? TOP_VIEW_ON : TOP_VIEW_OFF; + + // Determine which angle to used based on current status_. + float angle_to_rotate = (status_ == TOP_VIEW_ON) ? angle_on : angle_off; + return angle_to_rotate; +} + +float FrameAnnotationToRectCalculator::RotationAngleFromPose( + const Matrix3fRM& rotation, const Vector3f& translation, + const Vector3f& vec) { + auto p1 = rotation * vec + translation; + auto p2 = -rotation * vec + translation; + const float dy = p2[2] * p1[1] - p1[2] * p2[1]; + const float dx = p2[2] * p1[0] - p1[2] * p2[0]; + return std::atan2(-dy, dx); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_rect_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_rect_calculator.proto new file mode 100644 index 0000000000..8959cb868f --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_rect_calculator.proto @@ -0,0 +1,31 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message FrameAnnotationToRectCalculatorOptions { + extend CalculatorOptions { + optional FrameAnnotationToRectCalculatorOptions ext = 338119067; + } + + // The threshold to use when top-view is off,to enable hysteresis, + // It's required that off_threshold <= on_threshold. + optional float off_threshold = 1 [default = 40.0]; + // The threshold to use when top-view is on. + optional float on_threshold = 2 [default = 41.0]; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/landmarks_to_frame_annotation_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/landmarks_to_frame_annotation_calculator.cc new file mode 100644 index 0000000000..a5203c084f --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/landmarks_to_frame_annotation_calculator.cc @@ -0,0 +1,76 @@ +// Copyright 2020 The MediaPipe Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and + +#include "absl/memory/memory.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/landmark.pb.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" + +namespace mediapipe { + +namespace { + +constexpr char kInputLandmarksTag[] = "LANDMARKS"; +constexpr char kOutputFrameAnnotationTag[] = "FRAME_ANNOTATION"; + +} // namespace + +// A calculator that converts NormalizedLandmarkList to FrameAnnotation proto. +class LandmarksToFrameAnnotationCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + ::mediapipe::Status Process(CalculatorContext* cc) override; +}; +REGISTER_CALCULATOR(LandmarksToFrameAnnotationCalculator); + +::mediapipe::Status LandmarksToFrameAnnotationCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag(kInputLandmarksTag)) { + cc->Inputs().Tag(kInputLandmarksTag).Set(); + } + + if (cc->Outputs().HasTag(kOutputFrameAnnotationTag)) { + cc->Outputs().Tag(kOutputFrameAnnotationTag).Set(); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status LandmarksToFrameAnnotationCalculator::Process( + CalculatorContext* cc) { + auto frame_annotation = absl::make_unique(); + auto* box_annotation = frame_annotation->add_annotations(); + + const auto& landmarks = + cc->Inputs().Tag(kInputLandmarksTag).Get(); + RET_CHECK_GT(landmarks.landmark_size(), 0) + << "Input landmark vector is empty."; + for (int i = 0; i < landmarks.landmark_size(); ++i) { + auto* point2d = box_annotation->add_keypoints()->mutable_point_2d(); + point2d->set_x(landmarks.landmark(i).x()); + point2d->set_y(landmarks.landmark(i).y()); + } + // Output + if (cc->Outputs().HasTag(kOutputFrameAnnotationTag)) { + cc->Outputs() + .Tag(kOutputFrameAnnotationTag) + .Add(frame_annotation.release(), cc->InputTimestamp()); + } + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/object_occlusion_tracking.pbtxt b/mediapipe/graphs/object_detection_3d/object_occlusion_tracking.pbtxt new file mode 100644 index 0000000000..885f45f131 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/object_occlusion_tracking.pbtxt @@ -0,0 +1,189 @@ +# MediaPipe graph that performs box tracking with TensorFlow Lite on GPU. + +# Images coming into and out of the graph. +input_stream: "input_video" +input_stream: "input_width" +input_stream: "input_height" +output_stream: "output_video" + +# Throttles the images flowing downstream for flow control. It passes through +# the very first incoming image unaltered, and waits for downstream nodes +# (calculators and subgraphs) in the graph to finish their tasks before it +# passes through another image. 
All images that come in while waiting are +# dropped, limiting the number of in-flight images in most part of the graph to +# 1. This prevents the downstream nodes from queuing up incoming images and data +# excessively, which leads to increased latency and memory usage, unwanted in +# real-time mobile applications. It also eliminates unnecessarily computation, +# e.g., the output produced by a node may get dropped downstream if the +# subsequent nodes are still busy processing previous inputs. +node { + calculator: "FlowLimiterCalculator" + input_stream: "input_video" + input_stream: "FINISHED:box_rect" + input_stream_info: { + tag_index: "FINISHED" + back_edge: true + } + output_stream: "throttled_input_video" +} + +# Crops the image from the center to the size WIDTHxHEIGHT. +node: { + calculator: "ImageCroppingCalculator" + input_stream: "IMAGE_GPU:throttled_input_video" + output_stream: "IMAGE_GPU:throttled_input_video_4x3" + input_stream: "WIDTH:input_width" + input_stream: "HEIGHT:input_height" + node_options: { + [type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] { + border_mode: BORDER_REPLICATE + } + } +} + +# Caches a box-presence decision fed back from boxLandmarkSubgraph, and upon +# the arrival of the next input image sends out the cached decision with the +# timestamp replaced by that of the input image, essentially generating a packet +# that carries the previous box-presence decision. Note that upon the arrival +# of the very first input image, an empty packet is sent out to jump start the +# feedback loop. +node { + calculator: "PreviousLoopbackCalculator" + input_stream: "MAIN:throttled_input_video_4x3" + input_stream: "LOOP:box_presence" + input_stream_info: { + tag_index: "LOOP" + back_edge: true + } + output_stream: "PREV_LOOP:prev_box_presence" +} + +# Drops the incoming image if boxLandmarkSubgraph was able to identify box +# presence in the previous image. Otherwise, passes the incoming image through +# to trigger a new round of box detection in boxDetectionSubgraph. +node { + calculator: "GateCalculator" + input_stream: "throttled_input_video_4x3" + input_stream: "DISALLOW:prev_box_presence" + output_stream: "detection_input_video" + + node_options: { + [type.googleapis.com/mediapipe.GateCalculatorOptions] { + empty_packets_as_allow: true + } + } +} + +# Subgraph that performs 2D object detection. +node { + calculator: "ObjectDetectionOidV4Subgraph" + input_stream: "detection_input_video" + input_side_packet: "allowed_labels" + output_stream: "NORM_RECT:box_rect_from_object_detections" +} + +# Subgraph that localizes box landmarks (see subgraphs/box_landmark_gpu.pbtxt). +node { + calculator: "BoxLandmarkSubgraph" + input_stream: "IMAGE:throttled_input_video_4x3" + input_stream: "NORM_RECT:box_rect" + output_stream: "FRAME_ANNOTATION:lifted_objects" + output_stream: "NORM_RECT:box_rect_from_landmarks" + output_stream: "PRESENCE:box_presence" +} + +# Caches a box rectangle fed back from boxLandmarkSubgraph, and upon the +# arrival of the next input image sends out the cached rectangle with the +# timestamp replaced by that of the input image, essentially generating a packet +# that carries the previous box rectangle. Note that upon the arrival of the +# very first input image, an empty packet is sent out to jump start the +# feedback loop. 
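On the application side, the flow-limited graph above is driven by pushing frames into "input_video" and observing "output_video", following the pattern of MediaPipe's hello_world example. The sketch below is only illustrative: the demo apps feed GPU buffers through the platform wrappers rather than CPU ImageFrame packets, the companion "input_width" / "input_height" streams consumed by ImageCroppingCalculator are omitted for brevity, and the frame size, count, and timestamps are placeholders.

// Hedged sketch: driving the throttled graph from C++ with placeholder frames.
#include <cstdint>
#include <memory>

#include "absl/memory/memory.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/image_frame.h"
#include "mediapipe/framework/port/status.h"

::mediapipe::Status RunOnPlaceholderFrames(mediapipe::CalculatorGraph* graph) {
  MP_RETURN_IF_ERROR(graph->ObserveOutputStream(
      "output_video", [](const mediapipe::Packet& packet) {
        // Display or encode the rendered frame here.
        return ::mediapipe::OkStatus();
      }));
  MP_RETURN_IF_ERROR(graph->StartRun({}));
  for (int64_t i = 0; i < 5; ++i) {
    auto frame = absl::make_unique<mediapipe::ImageFrame>(
        mediapipe::ImageFormat::SRGB, /*width=*/640, /*height=*/480);
    MP_RETURN_IF_ERROR(graph->AddPacketToInputStream(
        "input_video",
        mediapipe::Adopt(frame.release()).At(mediapipe::Timestamp(i))));
  }
  MP_RETURN_IF_ERROR(graph->CloseInputStream("input_video"));
  return graph->WaitUntilDone();
}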
+node { + calculator: "PreviousLoopbackCalculator" + input_stream: "MAIN:throttled_input_video_4x3" + input_stream: "LOOP:box_rect_from_landmarks" + input_stream_info: { + tag_index: "LOOP" + back_edge: true + } + output_stream: "PREV_LOOP:prev_box_rect_from_landmarks" +} + +# Merges a stream of box rectangles generated by boxDetectionSubgraph and that +# generated by boxLandmarkSubgraph into a single output stream by selecting +# between one of the two streams. The former is selected if the incoming packet +# is not empty, i.e., box detection is performed on the current image by +# boxDetectionSubgraph (because boxLandmarkSubgraph could not identify box +# presence in the previous image). Otherwise, the latter is selected, which is +# never empty because boxLandmarkSubgraphs processes all images (that went +# through FlowLimiterCaculator). +node { + calculator: "MergeCalculator" + input_stream: "box_rect_from_object_detections" + input_stream: "prev_box_rect_from_landmarks" + output_stream: "box_rect" +} + +# The rendering nodes: +# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly +# on the texture, and 2) a virtual object, which we use as an occlusion mask. +# These models are designed using different tools, so we supply a transformation +# to bring both of them to the Objectron's coordinate system. + +# Creates a model matrices for the tracked object given the lifted 3D points. +# This calculator does two things: 1) Estimates object's pose (orientation, +# translation, and scale) from the 3D vertices, and +# 2) bring the object from the objectron's coordinate system to the renderer +# (OpenGL) coordinate system. Since the final goal is to render a mesh file on +# top of the object, we also supply a transformation to bring the mesh to the +# objectron's coordinate system, and rescale mesh to the unit size. +node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_objects" + output_stream: "MODEL_MATRICES:model_matrices" + node_options: { + [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { + # Re-scale the CAD model to the size of a unit box + model_scale: [0.04, 0.04, 0.04] + # Bring the box CAD model to objectron's coordinate system. This + # is equivalent of -pi/2 rotation along the y-axis (right-hand rule): + # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY()) + model_transformation: [0.0, 0.0, -1.0, 0.0] + model_transformation: [0.0, 1.0, 0.0, 0.0] + model_transformation: [1.0, 0.0, 0.0, 0.0] + model_transformation: [0.0, 0.0, 0.0, 1.0] + } + } +} + +# Compute the model matrices for the CAD model of the virtual object, to be used +# as an occlusion mask. The model will be rendered at the exact same location as +# the bounding box. +node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_objects" + input_side_packet: "MODEL_SCALE:model_scale" + input_side_packet: "MODEL_TRANSFORMATION:model_transformation" + output_stream: "MODEL_MATRICES:mask_model_matrices" +} + +# Render everything together. First we render the 3D bounding box animation, +# then we render the occlusion mask. 
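The AnnotationsToModelMatricesCalculator options above state that the row-major 4x4 model_transformation is equivalent to a -pi/2 rotation about the y-axis. A small standalone check with Eigen (already a dependency of these calculators) makes that equivalence concrete; it is a verification sketch only, not code from this change.

// Check of the comment above: the rows [0 0 -1 0; 0 1 0 0; 1 0 0 0; 0 0 0 1]
// are a -pi/2 rotation about +Y plus an identity homogeneous row/column.
#include <cmath>
#include <iostream>

#include "Eigen/Geometry"

int main() {
  const Eigen::Matrix3f r =
      Eigen::AngleAxisf(-M_PI / 2.0f, Eigen::Vector3f::UnitY())
          .toRotationMatrix();
  std::cout << r << std::endl;
  // Up to floating-point rounding this prints:
  //  0  0 -1
  //  0  1  0
  //  1  0  0
  return 0;
}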
+node: { + calculator: "GlAnimationOverlayCalculator" + input_stream: "VIDEO:throttled_input_video_4x3" + input_stream: "MODEL_MATRICES:model_matrices" + input_stream: "MASK_MODEL_MATRICES:mask_model_matrices" + output_stream: "output_video" + input_side_packet: "TEXTURE:box_texture" + input_side_packet: "ANIMATION_ASSET:box_asset_name" + input_side_packet: "MASK_TEXTURE:obj_texture" + input_side_packet: "MASK_ASSET:obj_asset_name" + node_options: { + [type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] { + aspect_ratio: 0.75 + vertical_fov_degrees: 70. + animation_speed_fps: 25 + } + } +} diff --git a/mediapipe/graphs/object_detection_3d/chair_classic_occlusion_tracking.pbtxt b/mediapipe/graphs/object_detection_3d/object_occlusion_tracking_1stage.pbtxt similarity index 98% rename from mediapipe/graphs/object_detection_3d/chair_classic_occlusion_tracking.pbtxt rename to mediapipe/graphs/object_detection_3d/object_occlusion_tracking_1stage.pbtxt index 5700bb1adc..cda2efd736 100644 --- a/mediapipe/graphs/object_detection_3d/chair_classic_occlusion_tracking.pbtxt +++ b/mediapipe/graphs/object_detection_3d/object_occlusion_tracking_1stage.pbtxt @@ -99,7 +99,7 @@ node { [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { # Re-scale the CAD model to the size of a unit box model_scale: [0.15, 0.1, 0.15] - # Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This + # Bring the CAD model to Deep Pursuit 3D's coordinate system. This # is equivalent of -pi/2 rotation along the x-axis: # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX()) model_transformation: [1.0, 0.0, 0.0, 0.0] diff --git a/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt b/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt deleted file mode 100644 index 0889c1d5a3..0000000000 --- a/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt +++ /dev/null @@ -1,134 +0,0 @@ -# MediaPipe object detection 3D with tracking graph. - -# Images on GPU coming into and out of the graph. -input_stream: "input_video" -input_stream: "input_width" -input_stream: "input_height" -output_stream: "output_video" - -# Crops the image from the center to the size WIDTHxHEIGHT. -node: { - calculator: "ImageCroppingCalculator" - input_stream: "IMAGE_GPU:input_video" - output_stream: "IMAGE_GPU:input_video_4x3" - input_stream: "WIDTH:input_width" - input_stream: "HEIGHT:input_height" - node_options: { - [type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] { - border_mode: BORDER_REPLICATE - } - } -} - -# Creates a copy of the input_video stream. At the end of the graph, the -# GlAnimationOverlayCalculator will consume the input_video texture and draws -# on top of it. -node: { - calculator: "GlScalerCalculator" - input_stream: "VIDEO:input_video_4x3" - output_stream: "VIDEO:input_video_copy" -} - -# Resamples the images by specific frame rate. This calculator is used to -# control the frequecy of subsequent calculators/subgraphs, e.g. less power -# consumption for expensive process. 
-node { - calculator: "PacketResamplerCalculator" - input_stream: "DATA:input_video_copy" - output_stream: "DATA:sampled_input_video" - node_options: { - [type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] { - frame_rate: 5 - } - } -} - -node { - calculator: "ObjectronDetectionSubgraphGpu" - input_stream: "IMAGE_GPU:sampled_input_video" - output_stream: "ANNOTATIONS:objects" -} - -node { - calculator: "ObjectronTrackingSubgraphGpu" - input_stream: "FRAME_ANNOTATION:objects" - input_stream: "IMAGE_GPU:input_video_copy" - output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects" -} - -# The rendering nodes: -# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly -# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask. -# These models are designed using different tools, so we supply a transformation -# to bring both of them to the Objectron's coordinate system. - -# Creates a model matrices for the tracked object given the lifted 3D points. -# This calculator does two things: 1) Estimates object's pose (orientation, -# translation, and scale) from the 3D vertices, and -# 2) bring the object from the objectron's coordinate system to the renderer -# (OpenGL) coordinate system. Since the final goal is to render a mesh file on -# top of the object, we also supply a transformation to bring the mesh to the -# objectron's coordinate system, and rescale mesh to the unit size. -node { - calculator: "AnnotationsToModelMatricesCalculator" - input_stream: "ANNOTATIONS:lifted_tracked_objects" - output_stream: "MODEL_MATRICES:model_matrices" - node_options: { - [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { - # Re-scale the CAD model to the size of a unit box - model_scale: [0.05, 0.05, 0.05] - # Bring the box CAD model to objectron's coordinate system. This - # is equivalent of -pi/2 rotation along the y-axis (right-hand rule): - # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY()) - model_transformation: [0.0, 0.0, -1.0, 0.0] - model_transformation: [0.0, 1.0, 0.0, 0.0] - model_transformation: [1.0, 0.0, 0.0, 0.0] - model_transformation: [0.0, 0.0, 0.0, 1.0] - } - } -} - -# Compute the model matrices for the CAD model of the shoe, to be used as an -# occlusion mask. The model will be rendered at the exact same location as the -# bounding box. -node { - calculator: "AnnotationsToModelMatricesCalculator" - input_stream: "ANNOTATIONS:lifted_tracked_objects" - output_stream: "MODEL_MATRICES:mask_model_matrices" - #input_side_packet: "MODEL_SCALE:model_scale" - node_options: { - [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { - # Re-scale the CAD model to the size of a unit box - model_scale: [0.45, 0.25, 0.15] - # Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This - # is equivalent of -pi/2 rotation along the x-axis (right-hand rule): - # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX()) - model_transformation: [1.0, 0.0, 0.0, 0.0] - model_transformation: [0.0, 0.0, 1.0, 0.0] - model_transformation: [0.0, -1.0, 0.0, 0.0] - model_transformation: [0.0, 0.0, 0.0, 1.0] - } - } -} - -# Render everything together. First we render the 3D bounding box animation, -# then we render the occlusion mask. 
-node: { - calculator: "GlAnimationOverlayCalculator" - input_stream: "VIDEO:input_video_4x3" - input_stream: "MODEL_MATRICES:model_matrices" - input_stream: "MASK_MODEL_MATRICES:mask_model_matrices" - output_stream: "output_video" - input_side_packet: "TEXTURE:box_texture" - input_side_packet: "ANIMATION_ASSET:box_asset_name" - input_side_packet: "MASK_TEXTURE:obj_texture" - input_side_packet: "MASK_ASSET:obj_asset_name" - node_options: { - [type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] { - # Output resolution is 480x640 with the aspect ratio of 0.75 - aspect_ratio: 0.75 - vertical_fov_degrees: 70. - animation_speed_fps: 25 - } - } -} diff --git a/mediapipe/graphs/object_detection_3d/subgraphs/BUILD b/mediapipe/graphs/object_detection_3d/subgraphs/BUILD index 763e7372a2..10d5b92e6a 100644 --- a/mediapipe/graphs/object_detection_3d/subgraphs/BUILD +++ b/mediapipe/graphs/object_detection_3d/subgraphs/BUILD @@ -50,3 +50,48 @@ mediapipe_simple_subgraph( "//mediapipe/graphs/object_detection_3d/calculators:lift_2d_frame_annotation_to_3d_calculator", ], ) + +mediapipe_simple_subgraph( + name = "box_landmark_gpu", + graph = "box_landmark_gpu.pbtxt", + register_as = "BoxLandmarkSubgraph", + deps = [ + "//mediapipe/calculators/core:split_vector_calculator", + "//mediapipe/calculators/image:image_cropping_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tflite:tflite_converter_calculator", + "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", + "//mediapipe/calculators/tflite:tflite_inference_calculator", + "//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator", + "//mediapipe/calculators/tflite:tflite_tensors_to_landmarks_calculator", + "//mediapipe/calculators/util:detections_to_rects_calculator", + "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", + "//mediapipe/calculators/util:landmark_projection_calculator", + "//mediapipe/calculators/util:landmarks_smoothing_calculator", + "//mediapipe/calculators/util:landmarks_to_detection_calculator", + "//mediapipe/calculators/util:rect_transformation_calculator", + "//mediapipe/calculators/util:thresholding_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:frame_annotation_to_rect_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:landmarks_to_frame_annotation_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:lift_2d_frame_annotation_to_3d_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "object_detection_oid_v4_gpu", + graph = "object_detection_oid_v4_gpu.pbtxt", + register_as = "ObjectDetectionOidV4Subgraph", + deps = [ + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tflite:ssd_anchors_calculator", + "//mediapipe/calculators/tflite:tflite_converter_calculator", + "//mediapipe/calculators/tflite:tflite_inference_calculator", + "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", + "//mediapipe/calculators/util:detection_label_id_to_text_calculator", + "//mediapipe/calculators/util:detections_to_rects_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", + "//mediapipe/calculators/util:rect_transformation_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:filter_detection_calculator", + ], +) diff --git a/mediapipe/graphs/object_detection_3d/subgraphs/box_landmark_gpu.pbtxt 
b/mediapipe/graphs/object_detection_3d/subgraphs/box_landmark_gpu.pbtxt new file mode 100644 index 0000000000..b29c8b11c2 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/subgraphs/box_landmark_gpu.pbtxt @@ -0,0 +1,205 @@ +# MediaPipe Box landmark localization subgraph. + +type: "BoxLandmarkSubgraph" + +input_stream: "IMAGE:input_video" +input_stream: "NORM_RECT:box_rect" +output_stream: "FRAME_ANNOTATION:lifted_box" +output_stream: "NORM_RECT:box_rect_for_next_frame" +output_stream: "PRESENCE:box_presence" + +# Crops the rectangle that contains a box from the input image. +node { + calculator: "ImageCroppingCalculator" + input_stream: "IMAGE_GPU:input_video" + input_stream: "NORM_RECT:box_rect" + output_stream: "IMAGE_GPU:box_image" + node_options: { + [type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] { + border_mode: BORDER_REPLICATE + } + } +} + +# Transforms the input image on GPU to a 256x256 image. To scale the input +# image, the scale_mode option is set to FIT to preserve the aspect ratio, +# resulting in potential letterboxing in the transformed image. +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:box_image" + output_stream: "IMAGE_GPU:transformed_box_image" + output_stream: "LETTERBOX_PADDING:letterbox_padding" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 224 + output_height: 224 + scale_mode: FIT + } + } +} + +# Converts the transformed input image on GPU into an image tensor stored as a +# TfLiteTensor. +node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE_GPU:transformed_box_image" + output_stream: "TENSORS_GPU:image_tensor" + node_options: { + [type.googleapis.com/mediapipe.TfLiteConverterCalculatorOptions] { + zero_center: false + } + } +} + +# Generates a single side packet containing a TensorFlow Lite op resolver that +# supports custom ops needed by the model used in this graph. +node { + calculator: "TfLiteCustomOpResolverCalculator" + output_side_packet: "opresolver" +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS_GPU:image_tensor" + output_stream: "TENSORS:output_tensors" + input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "object_detection_3d.tflite" + use_gpu: true + } + } +} + +# Splits a vector of tensors into multiple vectors. +node { + calculator: "SplitTfLiteTensorVectorCalculator" + input_stream: "output_tensors" + output_stream: "landmark_tensors" + output_stream: "box_flag_tensor" + node_options: { + [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { + ranges: { begin: 0 end: 1 } + ranges: { begin: 1 end: 2 } + } + } +} + +# Converts the box-flag tensor into a float that represents the confidence +# score of box presence. +node { + calculator: "TfLiteTensorsToFloatsCalculator" + input_stream: "TENSORS:box_flag_tensor" + output_stream: "FLOAT:box_presence_score" +} + +# Applies a threshold to the confidence score to determine whether a box is +# present. 
+node { + calculator: "ThresholdingCalculator" + input_stream: "FLOAT:box_presence_score" + output_stream: "FLAG:box_presence" + node_options: { + [type.googleapis.com/mediapipe.ThresholdingCalculatorOptions] { + threshold: 0.99 + } + } +} + +# Decodes the landmark tensors into a list of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "TfLiteTensorsToLandmarksCalculator" + input_stream: "TENSORS:landmark_tensors" + output_stream: "NORM_LANDMARKS:landmarks" + node_options: { + [type.googleapis.com/mediapipe.TfLiteTensorsToLandmarksCalculatorOptions] { + num_landmarks: 9 + input_image_width: 224 + input_image_height: 224 + } + } +} + +# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed box +# image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (box +# image before image transformation). +node { + calculator: "LandmarkLetterboxRemovalCalculator" + input_stream: "LANDMARKS:landmarks" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "LANDMARKS:scaled_landmarks" +} + +# Projects the landmarks from the cropped box image to the corresponding +# locations on the full image before cropping (input to the graph). +node { + calculator: "LandmarkProjectionCalculator" + input_stream: "NORM_LANDMARKS:scaled_landmarks" + input_stream: "NORM_RECT:box_rect" + output_stream: "NORM_LANDMARKS:box_landmarks" +} + +# Extracts image size from the input images. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "SIZE:image_size" +} + +# Smooth predicted landmarks coordinates. +node { + calculator: "LandmarksSmoothingCalculator" + input_stream: "NORM_LANDMARKS:box_landmarks" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "NORM_FILTERED_LANDMARKS:box_landmarks_filtered" + node_options: { + [type.googleapis.com/mediapipe.LandmarksSmoothingCalculatorOptions] { + velocity_filter: { + window_size: 10 + velocity_scale: 7.5 + } + } + } +} + +# Convert box landmarks to frame annotation. +node { + calculator: "LandmarksToFrameAnnotationCalculator" + input_stream: "LANDMARKS:box_landmarks_filtered" + output_stream: "FRAME_ANNOTATION:box_annotation" +} + +# Lift the 2D landmarks to 3D using EPnP algorithm. +node { + calculator: "Lift2DFrameAnnotationTo3DCalculator" + input_stream: "FRAME_ANNOTATION:box_annotation" + output_stream: "LIFTED_FRAME_ANNOTATION:lifted_box" +} + +# Get rotated rectangle from lifted box. +node { + calculator: "FrameAnnotationToRectCalculator" + input_stream: "FRAME_ANNOTATION:lifted_box" + output_stream: "NORM_RECT:rect_from_box" +} + +# Expands the box rectangle so that in the next video frame it's likely to +# still contain the box even with some motion. 
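Earlier in this subgraph, ImageTransformationCalculator letterboxes the 4:3 crop into the square model input (scale_mode FIT, 224x224 per the options), and LandmarkLetterboxRemovalCalculator later maps landmarks back using LETTERBOX_PADDING. The sketch below works through that padding arithmetic; the [left, top, right, bottom] normalized-padding convention is an assumption made for illustration.

// Hedged sketch of the normalized letterbox padding for an aspect-preserving
// fit of a w:h input into a square model input.
#include <array>
#include <cstdio>

std::array<float, 4> LetterboxPadding(float in_w, float in_h) {
  const float in_aspect = in_w / in_h;      // e.g. a 4:3 crop -> 1.333
  float pad_x = 0.f, pad_y = 0.f;
  if (in_aspect > 1.f) {
    pad_y = (1.f - 1.f / in_aspect) / 2.f;  // bars on top and bottom
  } else {
    pad_x = (1.f - in_aspect) / 2.f;        // bars on left and right
  }
  return {pad_x, pad_y, pad_x, pad_y};
}

int main() {
  const auto pad = LetterboxPadding(4.f, 3.f);
  // A 4:3 crop fit into 224x224 occupies 224x168 pixels, leaving 28 px
  // (0.125 normalized) of padding at the top and at the bottom.
  std::printf("left=%.3f top=%.3f right=%.3f bottom=%.3f\n",
              pad[0], pad[1], pad[2], pad[3]);
  return 0;
}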
+node { + calculator: "RectTransformationCalculator" + input_stream: "NORM_RECT:rect_from_box" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "box_rect_for_next_frame" + node_options: { + [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { + scale_x: 1.5 + scale_y: 1.5 + square_long: true + } + } +} diff --git a/mediapipe/graphs/object_detection_3d/subgraphs/object_detection_oid_v4_gpu.pbtxt b/mediapipe/graphs/object_detection_3d/subgraphs/object_detection_oid_v4_gpu.pbtxt new file mode 100644 index 0000000000..7dc01e6e16 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/subgraphs/object_detection_oid_v4_gpu.pbtxt @@ -0,0 +1,177 @@ +# MediaPipe Objectron object bounding box detection subgraph. + +type: "ObjectDetectionSubgraph" + +input_stream: "input_video" +input_side_packet: "allowed_labels" +output_stream: "NORM_RECT:box_rect_from_object_detections" + +# Transforms the input image on GPU to a 320x320 image. To scale the image, by +# default it uses the STRETCH scale mode that maps the entire input image to the +# entire transformed image. As a result, image aspect ratio may be changed and +# objects in the image may be deformed (stretched or squeezed), but the object +# detection model used in this graph is agnostic to that deformation. +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 300 + output_height: 300 + } + } +} + +# Converts the transformed input image on GPU into an image tensor stored as a +# TfLiteTensor. +node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE_GPU:transformed_input_video" + output_stream: "TENSORS_GPU:image_tensor" +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS_GPU:image_tensor" + output_stream: "TENSORS_GPU:detection_tensors" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "object_detection_ssd_mobilenetv2_oidv4_fp16.tflite" + } + } +} + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + node_options: { + [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { + num_layers: 6 + min_scale: 0.2 + max_scale: 0.95 + input_size_height: 300 + input_size_width: 300 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + strides: 16 + strides: 32 + strides: 64 + strides: 128 + strides: 256 + strides: 512 + aspect_ratios: 1.0 + aspect_ratios: 2.0 + aspect_ratios: 0.5 + aspect_ratios: 3.0 + aspect_ratios: 0.3333 + reduce_boxes_in_lowest_layer: true + } + } +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. 
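The decoder node below expects exactly 1917 boxes, and that number falls straight out of the SsdAnchorsCalculator options above: a 300x300 input with strides 16 through 512 gives 19/10/5/3/2/1-cell feature maps, reduce_boxes_in_lowest_layer trims the first layer to 3 anchors per cell, and the remaining layers get 6 per cell (five aspect ratios plus the usual interpolated-scale anchor). The 3-versus-6 per-cell counts are the standard SSD convention and an assumption on my part, not something spelled out in the options; a quick sanity check:

```python
import math

input_size = 300
strides = [16, 32, 64, 128, 256, 512]
feature_maps = [math.ceil(input_size / s) for s in strides]   # [19, 10, 5, 3, 2, 1]
anchors_per_cell = [3] + [6] * 5   # layer 0 reduced, 6 anchors per cell elsewhere

total = sum(f * f * n for f, n in zip(feature_maps, anchors_per_cell))
print(feature_maps, total)   # [19, 10, 5, 3, 2, 1] 1917, matching num_boxes below
```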
+node { + calculator: "TfLiteTensorsToDetectionsCalculator" + input_stream: "TENSORS_GPU:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { + num_classes: 195 + num_boxes: 1917 + num_coords: 4 + ignore_classes: 0 + sigmoid_score: true + apply_exponential_on_box_size: true + x_scale: 10.0 + y_scale: 10.0 + h_scale: 5.0 + w_scale: 5.0 + min_score_thresh: 0.6 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "detections" + output_stream: "suppressed_detections" + node_options: { + [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { + min_suppression_threshold: 0.4 + max_num_detections: 1 + overlap_type: INTERSECTION_OVER_UNION + return_empty_detections: true + } + } +} + +# Maps detection label IDs to the corresponding label text. The label map is +# provided in the label_map_path option. +node { + calculator: "DetectionLabelIdToTextCalculator" + input_stream: "suppressed_detections" + output_stream: "labeled_detections" + node_options: { + [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { + label_map_path: "object_detection_oidv4_labelmap.pbtxt" + } + } +} + +node { + calculator: "FilterDetectionCalculator" + input_stream: "DETECTIONS:labeled_detections" + output_stream: "DETECTIONS:filtered_detections" + input_side_packet: "LABELS_CSV:allowed_labels" + node_options: { + [type.googleapis.com/mediapipe.FilterDetectionCalculatorOptions]: { + min_score: 0.4 + } + } +} + +# Extracts image size from the input images. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "SIZE:image_size" +} + +# Converts results of box detection into a rectangle (normalized by image size) +# that encloses the box. +node { + calculator: "DetectionsToRectsCalculator" + input_stream: "DETECTIONS:filtered_detections" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "NORM_RECT:box_rect" + node_options: { + [type.googleapis.com/mediapipe.DetectionsToRectsCalculatorOptions] { + output_zero_rect_for_empty_detections: true + } + } +} + +# Expands the rectangle that contains the box so that it's likely to cover the +# entire box. 
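Before the final rect-expansion node below, it helps to pin down what the decoder above computes per anchor. With apply_exponential_on_box_size and the 10/10/5/5 scales, this reads as the usual SSD box coder; the sketch below is my interpretation of those options (including the ty/tx/th/tw ordering), not the calculator's literal code.

```python
import math

def decode_box(raw, anchor, x_scale=10.0, y_scale=10.0, w_scale=5.0, h_scale=5.0):
    """raw: regressed offsets (ty, tx, th, tw); anchor: (ycenter, xcenter, h, w);
    all values in normalized [0, 1] image coordinates (ordering assumed)."""
    ty, tx, th, tw = raw
    ay, ax, ah, aw = anchor
    ycenter = ty / y_scale * ah + ay
    xcenter = tx / x_scale * aw + ax
    h = math.exp(th / h_scale) * ah        # apply_exponential_on_box_size: true
    w = math.exp(tw / w_scale) * aw
    return (ycenter - h / 2, xcenter - w / 2, ycenter + h / 2, xcenter + w / 2)

# Zero offsets decode back to the anchor itself.
print(decode_box((0, 0, 0, 0), (0.5, 0.5, 0.2, 0.2)))   # (0.4, 0.4, 0.6, 0.6)
```

The per-class scores additionally pass through a sigmoid (sigmoid_score: true) before the 0.6 min_score_thresh is applied.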
+node { + calculator: "RectTransformationCalculator" + input_stream: "NORM_RECT:box_rect" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "box_rect_from_object_detections" + node_options: { + [type.googleapis.com/mediapipe.RectTransformationCalculatorOptions] { + scale_x: 1.5 + scale_y: 1.5 + } + } +} diff --git a/mediapipe/java/com/google/mediapipe/framework/proguard.pgcfg b/mediapipe/java/com/google/mediapipe/framework/proguard.pgcfg index 699d36eee3..34dce63d09 100644 --- a/mediapipe/java/com/google/mediapipe/framework/proguard.pgcfg +++ b/mediapipe/java/com/google/mediapipe/framework/proguard.pgcfg @@ -24,3 +24,6 @@ -keep public class com.google.mediapipe.framework.MediaPipeException { (int, byte[]); } + +# Required to use PacketCreator#createProto +-keep class com.google.mediapipe.framework.ProtoUtil$SerializedMessage { *; } diff --git a/mediapipe/models/hand_landmark.tflite b/mediapipe/models/hand_landmark.tflite deleted file mode 100644 index 886159fc85..0000000000 Binary files a/mediapipe/models/hand_landmark.tflite and /dev/null differ diff --git a/mediapipe/models/object_detection_3d_camera.tflite b/mediapipe/models/object_detection_3d_camera.tflite new file mode 100644 index 0000000000..14cb826b1b Binary files /dev/null and b/mediapipe/models/object_detection_3d_camera.tflite differ diff --git a/mediapipe/models/object_detection_3d_chair.tflite b/mediapipe/models/object_detection_3d_chair.tflite index 718dc97664..3a23dfdcea 100644 Binary files a/mediapipe/models/object_detection_3d_chair.tflite and b/mediapipe/models/object_detection_3d_chair.tflite differ diff --git a/mediapipe/models/object_detection_3d_chair_1stage.tflite b/mediapipe/models/object_detection_3d_chair_1stage.tflite new file mode 100644 index 0000000000..718dc97664 Binary files /dev/null and b/mediapipe/models/object_detection_3d_chair_1stage.tflite differ diff --git a/mediapipe/models/object_detection_3d_cup.tflite b/mediapipe/models/object_detection_3d_cup.tflite new file mode 100644 index 0000000000..1a7a5d304f Binary files /dev/null and b/mediapipe/models/object_detection_3d_cup.tflite differ diff --git a/mediapipe/models/object_detection_3d_sneakers.tflite b/mediapipe/models/object_detection_3d_sneakers.tflite index 2077114336..d64234d598 100644 Binary files a/mediapipe/models/object_detection_3d_sneakers.tflite and b/mediapipe/models/object_detection_3d_sneakers.tflite differ diff --git a/mediapipe/models/object_detection_3d_sneakers_1stage.tflite b/mediapipe/models/object_detection_3d_sneakers_1stage.tflite new file mode 100644 index 0000000000..2077114336 Binary files /dev/null and b/mediapipe/models/object_detection_3d_sneakers_1stage.tflite differ diff --git a/mediapipe/models/object_detection_oidv4_labelmap.pbtxt b/mediapipe/models/object_detection_oidv4_labelmap.pbtxt new file mode 100644 index 0000000000..7d93e64021 --- /dev/null +++ b/mediapipe/models/object_detection_oidv4_labelmap.pbtxt @@ -0,0 +1,195 @@ +??? 
+Container +Ambulance +Ladder +Toothbrush +Sink +Cassette deck +Beer +Parking meter +Traffic light +Washing machine +Sunglasses +Ball +Backpack +Bicycle +Home appliance +Boat +Boot +Headphones +Bus +Screwdriver +Laptop +Teapot +Person +Swimwear +Balloon +Wrench +Vehicle registration plate +Lantern +Toaster +Flashlight +Billboard +Limousine +Necklace +Scissors +Stairs +Computer keyboard +Printer +Traffic sign +Chair +Poster +Fire hydrant +Land vehicle +Cabinetry +Suitcase +Snowmobile +Clock +Cattle +Cello +Desk +Cat +Computer mouse +Calculator +Computer monitor +Box +Stapler +Studio couch +Drum +Dice +Oven +Couch +Whiteboard +Door +Hat +Eraser +Tin can +Mug +Can opener +Goggles +Roller skates +Coffee cup +Cutting board +Blender +Stop sign +Volleyball +Vase +Slow cooker +Wardrobe +Paper towel +Sun hat +Tree house +Gas stove +Salt and pepper shakers +Mechanical fan +Fax +Nightstand +Barrel +Guitar +Pillow +Stationary bicycle +Hammer +Ceiling fan +Sofa bed +Sandal +Bicycle helmet +Bed +Kettle +Hair dryer +Kitchenware +Bookcase +Refrigerator +Alarm clock +Filing cabinet +Table +Knife +Bottle +Dumbbell +Bowl +Billiard table +Motorcycle +Frying pan +Bathroom cabinet +Plate +Mobile phone +Table tennis racket +Musical keyboard +Scoreboard +Briefcase +Kitchen knife +Piano +Pumpkin +Infant bed +Mixer +Cupboard +Digital clock +Rifle +Skateboard +High heels +Snowboard +Sword +Training bench +Coffee table +Television +Trombone +Tank +Telephone +Trumpet +Train +Picnic basket +Football helmet +Truck +Measuring cup +Coffeemaker +Violin +Vehicle +Wine +Wheel +Jug +Toilet +Clothing +Footwear +Tablet computer +Dog +Book +Candle +Hand dryer +Soap dispenser +Furniture +Airplane +Spoon +Bench +Window +Closet +Fork +Lamp +Camera +Racket +Human face +Unicycle +Flowerpot +Drawer +Stool +Microwave oven +Shelf +Handgun +Van +Corded phone +Tennis racket +Wall clock +Kitchen & dining room table +Pressure cooker +Kitchen appliance +Tire +Luggage and bags +Microphone +Glasses +Pen +Car +Aircraft +Dishwasher +Binoculars +Rays and skates +Remote control +Wheelchair +Helmet diff --git a/mediapipe/models/object_detection_ssd_mobilenetv2_oidv4_fp16.tflite b/mediapipe/models/object_detection_ssd_mobilenetv2_oidv4_fp16.tflite new file mode 100644 index 0000000000..fa6ad878d3 Binary files /dev/null and b/mediapipe/models/object_detection_ssd_mobilenetv2_oidv4_fp16.tflite differ diff --git a/mediapipe/models/palm_detection.tflite b/mediapipe/models/palm_detection.tflite deleted file mode 100644 index 94c984cbff..0000000000 Binary files a/mediapipe/models/palm_detection.tflite and /dev/null differ diff --git a/mediapipe/models/palm_detection_labelmap.txt b/mediapipe/models/palm_detection_labelmap.txt deleted file mode 100644 index f3bf607d70..0000000000 --- a/mediapipe/models/palm_detection_labelmap.txt +++ /dev/null @@ -1 +0,0 @@ -Palm diff --git a/mediapipe/modules/README.md b/mediapipe/modules/README.md index c38ff9a50b..d38744bc12 100644 --- a/mediapipe/modules/README.md +++ b/mediapipe/modules/README.md @@ -7,7 +7,10 @@ Each module (represented as a subfolder) provides subgraphs and corresponding re | Module | Description | | :--- | :--- | | [`face_detection`](face_detection/README.md) | Subgraphs to detect faces. | +| [`face_geometry`](face_geometry/README.md) | Subgraphs to extract face geometry. | | [`face_landmark`](face_landmark/README.md) | Subgraphs to detect and track face landmarks. | +| [`hand_landmark`](hand_landmark/README.md) | Subgraphs to detect and track hand landmarks. 
| | [`iris_landmark`](iris_landmark/README.md) | Subgraphs to detect iris landmarks. | +| [`palm_detection`](palm_detection/README.md) | Subgraphs to detect palms/hands. | | [`pose_detection`](pose_detection/README.md) | Subgraphs to detect poses. | | [`pose_landmark`](pose_landmark/README.md) | Subgraphs to detect and track pose landmarks. | diff --git a/mediapipe/modules/face_detection/BUILD b/mediapipe/modules/face_detection/BUILD index bb576a9873..0af70e2ef0 100644 --- a/mediapipe/modules/face_detection/BUILD +++ b/mediapipe/modules/face_detection/BUILD @@ -26,11 +26,10 @@ mediapipe_simple_subgraph( graph = "face_detection_front_cpu.pbtxt", register_as = "FaceDetectionFrontCpu", deps = [ - "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", "//mediapipe/calculators/util:detection_letterbox_removal_calculator", "//mediapipe/calculators/util:non_max_suppression_calculator", ], @@ -41,11 +40,10 @@ mediapipe_simple_subgraph( graph = "face_detection_front_gpu.pbtxt", register_as = "FaceDetectionFrontGpu", deps = [ - "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", "//mediapipe/calculators/util:detection_letterbox_removal_calculator", "//mediapipe/calculators/util:non_max_suppression_calculator", ], diff --git a/mediapipe/modules/face_detection/face_detection_front_cpu.pbtxt b/mediapipe/modules/face_detection/face_detection_front_cpu.pbtxt index fda86fc502..2e0975ce9a 100644 --- a/mediapipe/modules/face_detection/face_detection_front_cpu.pbtxt +++ b/mediapipe/modules/face_detection/face_detection_front_cpu.pbtxt @@ -24,41 +24,36 @@ input_stream: "IMAGE:image" # this packet so that they don't wait for it unnecessarily. output_stream: "DETECTIONS:detections" -# Transforms the input image on CPU to a 128x128 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio -# (what is expected by the corresponding face detection model), resulting in -# potential letterboxing in the transformed image. +# Transforms the input image into a 128x128 tensor while keeping the aspect +# ratio (what is expected by the corresponding face detection model), resulting +# in potential letterboxing in the transformed image. 
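This change swaps the TfLite* calculator chain for the newer tensor calculators, so ImageToTensorCalculator in the hunk below handles resizing, letterboxing, and normalization in one step. Its output_tensor_float_range maps 8-bit pixel values into [-1, 1], the range this face detection model expects (the landmark graphs later in this change use [0, 1] instead). A minimal sketch, assuming the mapping is a plain linear rescale applied per channel:

```python
def to_float_range(pixel: int, out_min: float = -1.0, out_max: float = 1.0) -> float:
    """Map an 8-bit pixel value into [out_min, out_max], as the
    output_tensor_float_range option is assumed to do."""
    return out_min + (out_max - out_min) * (pixel / 255.0)

print(to_float_range(0), to_float_range(128), to_float_range(255))
# -1.0 0.0039... 1.0
```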
node: { - calculator: "ImageTransformationCalculator" + calculator: "ImageToTensorCalculator" input_stream: "IMAGE:image" - output_stream: "IMAGE:transformed_image" + output_stream: "TENSORS:input_tensors" output_stream: "LETTERBOX_PADDING:letterbox_padding" options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 128 - output_height: 128 - scale_mode: FIT + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 128 + output_tensor_height: 128 + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } } } } -# Converts the transformed input image on CPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_image" - output_stream: "TENSORS:input_tensors" -} - # Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a # vector of tensors representing, for instance, detection boxes/keypoints and # scores. node { - calculator: "TfLiteInferenceCalculator" + calculator: "InferenceCalculator" input_stream: "TENSORS:input_tensors" output_stream: "TENSORS:detection_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/face_detection/face_detection_front.tflite" delegate { xnnpack {} } } @@ -93,12 +88,12 @@ node { # the SSD anchors and the specification in the options, into a vector of # detections. Each detection describes a detected object. node { - calculator: "TfLiteTensorsToDetectionsCalculator" + calculator: "TensorsToDetectionsCalculator" input_stream: "TENSORS:detection_tensors" input_side_packet: "ANCHORS:anchors" output_stream: "DETECTIONS:unfiltered_detections" options: { - [mediapipe.TfLiteTensorsToDetectionsCalculatorOptions.ext] { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { num_classes: 1 num_boxes: 896 num_coords: 16 diff --git a/mediapipe/modules/face_detection/face_detection_front_gpu.pbtxt b/mediapipe/modules/face_detection/face_detection_front_gpu.pbtxt index 74bc46dd33..df2b045511 100644 --- a/mediapipe/modules/face_detection/face_detection_front_gpu.pbtxt +++ b/mediapipe/modules/face_detection/face_detection_front_gpu.pbtxt @@ -1,5 +1,5 @@ -# MediaPipe graph to detect faces. (GPU input, and inference is executed on -# GPU.) +# MediaPipe graph to detect faces. (CPU input, and inference is executed on +# CPU.) # # It is required that "face_detection_front.tflite" is available at # "mediapipe/modules/face_detection/face_detection_front.tflite" @@ -24,41 +24,37 @@ input_stream: "IMAGE:image" # this packet so that they don't wait for it unnecessarily. output_stream: "DETECTIONS:detections" -# Transforms the input image on GPU to a 128x128 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio -# (what is expected by the corresponding face detection model), resulting in -# potential letterboxing in the transformed image. +# Transforms the input image into a 128x128 tensor while keeping the aspect +# ratio (what is expected by the corresponding face detection model), resulting +# in potential letterboxing in the transformed image. 
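The GPU variant below receives the same 128x128 preprocessing (plus a gpu_origin setting). One pair of numbers worth unpacking from the decoder hunk above is num_boxes 896 and num_coords 16; these match the usual BlazeFace front-camera layout, which is an assumption here since the SsdAnchorsCalculator options sit outside this hunk: two feature maps at strides 8 and 16 over the 128x128 input, and 4 box coordinates plus 6 facial keypoints with 2 coordinates each.

```python
# Hedged sanity check of the face detector's output shape (assumed anchor layout).
map_s8 = 128 // 8      # 16x16 cells at stride 8, 2 anchors per cell
map_s16 = 128 // 16    # 8x8 cells at stride 16, 6 anchors per cell
num_boxes = map_s8 ** 2 * 2 + map_s16 ** 2 * 6   # 512 + 384
num_coords = 4 + 6 * 2                           # box (cx, cy, w, h) + 6 keypoints
print(num_boxes, num_coords)                     # 896 16
```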
node: { - calculator: "ImageTransformationCalculator" + calculator: "ImageToTensorCalculator" input_stream: "IMAGE_GPU:image" - output_stream: "IMAGE_GPU:transformed_image" + output_stream: "TENSORS:input_tensors" output_stream: "LETTERBOX_PADDING:letterbox_padding" options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 128 - output_height: 128 - scale_mode: FIT + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 128 + output_tensor_height: 128 + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } + gpu_origin: TOP_LEFT } } } -# Converts the transformed input image on GPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_image" - output_stream: "TENSORS_GPU:input_tensors" -} - # Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a # vector of tensors representing, for instance, detection boxes/keypoints and # scores. node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:input_tensors" - output_stream: "TENSORS_GPU:detection_tensors" + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" + output_stream: "TENSORS:detection_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/face_detection/face_detection_front.tflite" } } @@ -92,12 +88,12 @@ node { # the SSD anchors and the specification in the options, into a vector of # detections. Each detection describes a detected object. node { - calculator: "TfLiteTensorsToDetectionsCalculator" - input_stream: "TENSORS_GPU:detection_tensors" + calculator: "TensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" input_side_packet: "ANCHORS:anchors" output_stream: "DETECTIONS:unfiltered_detections" options: { - [mediapipe.TfLiteTensorsToDetectionsCalculatorOptions.ext] { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { num_classes: 1 num_boxes: 896 num_coords: 16 diff --git a/mediapipe/modules/face_landmark/BUILD b/mediapipe/modules/face_landmark/BUILD index ef31d4fc04..b72e8147fa 100644 --- a/mediapipe/modules/face_landmark/BUILD +++ b/mediapipe/modules/face_landmark/BUILD @@ -28,12 +28,10 @@ mediapipe_simple_subgraph( deps = [ "//mediapipe/calculators/core:gate_calculator", "//mediapipe/calculators/core:split_vector_calculator", - "//mediapipe/calculators/image:image_cropping_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_landmarks_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", "//mediapipe/calculators/util:landmark_projection_calculator", "//mediapipe/calculators/util:thresholding_calculator", ], @@ -46,12 +44,10 @@ mediapipe_simple_subgraph( deps = [ "//mediapipe/calculators/core:gate_calculator", "//mediapipe/calculators/core:split_vector_calculator", - "//mediapipe/calculators/image:image_cropping_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - 
"//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_landmarks_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", "//mediapipe/calculators/util:landmark_projection_calculator", "//mediapipe/calculators/util:thresholding_calculator", ], @@ -74,6 +70,7 @@ mediapipe_simple_subgraph( "//mediapipe/calculators/image:image_properties_calculator", "//mediapipe/calculators/util:association_norm_rect_calculator", "//mediapipe/calculators/util:collection_has_min_size_calculator", + "//mediapipe/calculators/util:logic_calculator", "//mediapipe/modules/face_detection:face_detection_front_cpu", ], ) @@ -95,6 +92,7 @@ mediapipe_simple_subgraph( "//mediapipe/calculators/image:image_properties_calculator", "//mediapipe/calculators/util:association_norm_rect_calculator", "//mediapipe/calculators/util:collection_has_min_size_calculator", + "//mediapipe/calculators/util:logic_calculator", "//mediapipe/modules/face_detection:face_detection_front_gpu", ], ) diff --git a/mediapipe/modules/face_landmark/face_landmark_cpu.pbtxt b/mediapipe/modules/face_landmark/face_landmark_cpu.pbtxt index 66ecf60d83..018d3f4d5f 100644 --- a/mediapipe/modules/face_landmark/face_landmark_cpu.pbtxt +++ b/mediapipe/modules/face_landmark/face_landmark_cpu.pbtxt @@ -28,51 +28,33 @@ input_stream: "ROI:roi" # the absence of this packet so that they don't wait for it unnecessarily. output_stream: "LANDMARKS:face_landmarks" -# Crops the input image to the region of interest. -node { - calculator: "ImageCroppingCalculator" +# Transforms the input image into a 192x192 tensor. +node: { + calculator: "ImageToTensorCalculator" input_stream: "IMAGE:image" input_stream: "NORM_RECT:roi" - output_stream: "IMAGE:face_region" + output_stream: "TENSORS:input_tensors" options: { - [mediapipe.ImageCroppingCalculatorOptions.ext] { - border_mode: BORDER_REPLICATE + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 192 + output_tensor_height: 192 + output_tensor_float_range { + min: 0.0 + max: 1.0 + } } } } -# Transforms the input image on CPU to a 192x192 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE:face_region" - output_stream: "IMAGE:transformed_face_region" - options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 192 - output_height: 192 - } - } -} - -# Converts the transformed input image on CPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_face_region" - output_stream: "TENSORS:input_tensor" -} - # Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a # vector of tensors representing, for instance, detection boxes/keypoints and # scores. 
node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS:input_tensor" + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" output_stream: "TENSORS:output_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/face_landmark/face_landmark.tflite" delegate { xnnpack {} } } @@ -81,7 +63,7 @@ node { # Splits a vector of tensors into multiple vectors. node { - calculator: "SplitTfLiteTensorVectorCalculator" + calculator: "SplitTensorVectorCalculator" input_stream: "output_tensors" output_stream: "landmark_tensors" output_stream: "face_flag_tensor" @@ -96,7 +78,7 @@ node { # Converts the face-flag tensor into a float that represents the confidence # score of face presence. node { - calculator: "TfLiteTensorsToFloatsCalculator" + calculator: "TensorsToFloatsCalculator" input_stream: "TENSORS:face_flag_tensor" output_stream: "FLOAT:face_presence_score" } @@ -125,11 +107,11 @@ node { # Decodes the landmark tensors into a vector of lanmarks, where the landmark # coordinates are normalized by the size of the input image to the model. node { - calculator: "TfLiteTensorsToLandmarksCalculator" + calculator: "TensorsToLandmarksCalculator" input_stream: "TENSORS:ensured_landmark_tensors" output_stream: "NORM_LANDMARKS:landmarks" options: { - [mediapipe.TfLiteTensorsToLandmarksCalculatorOptions.ext] { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { num_landmarks: 468 input_image_width: 192 input_image_height: 192 diff --git a/mediapipe/modules/face_landmark/face_landmark_front_cpu.pbtxt b/mediapipe/modules/face_landmark/face_landmark_front_cpu.pbtxt index cdd8a03f26..8245c85175 100644 --- a/mediapipe/modules/face_landmark/face_landmark_front_cpu.pbtxt +++ b/mediapipe/modules/face_landmark/face_landmark_front_cpu.pbtxt @@ -26,6 +26,10 @@ input_stream: "IMAGE:image" # Max number of faces to detect/track. (int) input_side_packet: "NUM_FACES:num_faces" +# Whether face detection can be skipped when face regions can already be +# approximated from face landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + # Collection of detected/predicted faces, each represented as a list of 468 face # landmarks. (std::vector) # NOTE: there will not be an output packet in the LANDMARKS stream for this @@ -54,12 +58,22 @@ node { } # Drops the incoming image if FaceLandmarkCpu was able to identify face presence -# in the previous image. Otherwise, passes the incoming image through to trigger -# a new round of face detection in FaceDetectionFrontCpu. +# in the previous image and skipping face detection is enabled. Otherwise, +# passes the incoming image through to trigger a new round of face detection +# in FaceDetectionFrontCpu. +node { + calculator: "LogicCalculator" + options: { + [mediapipe.LogicCalculatorOptions.ext] { op: AND } + } + input_side_packet: "can_skip_detection" + input_stream: "prev_has_enough_faces" + output_stream: "skip_detection" +} node { calculator: "GateCalculator" input_stream: "image" - input_stream: "DISALLOW:prev_has_enough_faces" + input_stream: "DISALLOW:skip_detection" output_stream: "gated_image" options: { [mediapipe.GateCalculatorOptions.ext] { @@ -67,6 +81,12 @@ node { } } } +node { + calculator: "GateCalculator" + input_stream: "prev_face_rects_from_landmarks" + input_stream: "ALLOW:skip_detection" + output_stream: "gated_prev_face_rects_from_landmarks" +} # Detects faces. 
node { @@ -129,7 +149,7 @@ node { # overlapping regions based on the specified min_similarity_threshold. node { calculator: "AssociationNormRectCalculator" - input_stream: "prev_face_rects_from_landmarks" + input_stream: "gated_prev_face_rects_from_landmarks" input_stream: "face_rects_from_detections" output_stream: "face_rects" options: { diff --git a/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt b/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt index d06aff1dfb..a008717f6c 100644 --- a/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt +++ b/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt @@ -26,6 +26,10 @@ input_stream: "IMAGE:image" # Max number of faces to detect/track. (int) input_side_packet: "NUM_FACES:num_faces" +# Whether face detection can be skipped when face regions can already be +# approximated from face landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + # Collection of detected/predicted faces, each represented as a list of 468 face # landmarks. (std::vector) # NOTE: there will not be an output packet in the LANDMARKS stream for this @@ -54,12 +58,22 @@ node { } # Drops the incoming image if FaceLandmarkGpu was able to identify face presence -# in the previous image. Otherwise, passes the incoming image through to trigger -# a new round of face detection in FaceDetectionFrontGpu. +# in the previous image and skipping face detection is enabled. Otherwise, +# passes the incoming image through to trigger a new round of face detection +# in FaceDetectionFrontGpu. +node { + calculator: "LogicCalculator" + options: { + [mediapipe.LogicCalculatorOptions.ext] { op: AND } + } + input_side_packet: "can_skip_detection" + input_stream: "prev_has_enough_faces" + output_stream: "skip_detection" +} node { calculator: "GateCalculator" input_stream: "image" - input_stream: "DISALLOW:prev_has_enough_faces" + input_stream: "DISALLOW:skip_detection" output_stream: "gated_image" options: { [mediapipe.GateCalculatorOptions.ext] { @@ -67,6 +81,12 @@ node { } } } +node { + calculator: "GateCalculator" + input_stream: "prev_face_rects_from_landmarks" + input_stream: "ALLOW:skip_detection" + output_stream: "gated_prev_face_rects_from_landmarks" +} # Detects faces. node { @@ -129,7 +149,7 @@ node { # overlapping regions based on the specified min_similarity_threshold. node { calculator: "AssociationNormRectCalculator" - input_stream: "prev_face_rects_from_landmarks" + input_stream: "gated_prev_face_rects_from_landmarks" input_stream: "face_rects_from_detections" output_stream: "face_rects" options: { diff --git a/mediapipe/modules/face_landmark/face_landmark_gpu.pbtxt b/mediapipe/modules/face_landmark/face_landmark_gpu.pbtxt index 3460d766d3..a606166e77 100644 --- a/mediapipe/modules/face_landmark/face_landmark_gpu.pbtxt +++ b/mediapipe/modules/face_landmark/face_landmark_gpu.pbtxt @@ -1,5 +1,5 @@ -# MediaPipe graph to detect/predict face landmarks. (GPU input, and inference is -# executed on GPU.) +# MediaPipe graph to detect/predict face landmarks. (CPU input, and inference is +# executed on CPU.) # # It is required that "face_landmark.tflite" is available at # "mediapipe/modules/face_landmark/face_landmark.tflite" @@ -28,51 +28,34 @@ input_stream: "ROI:roi" # the absence of this packet so that they don't wait for it unnecessarily. output_stream: "LANDMARKS:face_landmarks" -# Crops the input image to the given region of interest. 
-node { - calculator: "ImageCroppingCalculator" +# Transforms the input image into a 192x192 tensor. +node: { + calculator: "ImageToTensorCalculator" input_stream: "IMAGE_GPU:image" input_stream: "NORM_RECT:roi" - output_stream: "IMAGE_GPU:face_region" + output_stream: "TENSORS:input_tensors" options: { - [mediapipe.ImageCroppingCalculatorOptions.ext] { - border_mode: BORDER_REPLICATE + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 192 + output_tensor_height: 192 + output_tensor_float_range { + min: 0.0 + max: 1.0 + } + gpu_origin: TOP_LEFT } } } -# Transforms the input image on GPU to a 192x192 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE_GPU:face_region" - output_stream: "IMAGE_GPU:transformed_face_region" - options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 192 - output_height: 192 - } - } -} - -# Converts the transformed input image on GPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_face_region" - output_stream: "TENSORS_GPU:input_tensor" -} - # Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a # vector of GPU tensors representing, for instance, detection boxes/keypoints # and scores. node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:input_tensor" + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" output_stream: "TENSORS:output_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/face_landmark/face_landmark.tflite" } } @@ -80,7 +63,7 @@ node { # Splits a vector of tensors into multiple vectors. node { - calculator: "SplitTfLiteTensorVectorCalculator" + calculator: "SplitTensorVectorCalculator" input_stream: "output_tensors" output_stream: "landmark_tensors" output_stream: "face_flag_tensor" @@ -95,7 +78,7 @@ node { # Converts the face-flag tensor into a float that represents the confidence # score of face presence. node { - calculator: "TfLiteTensorsToFloatsCalculator" + calculator: "TensorsToFloatsCalculator" input_stream: "TENSORS:face_flag_tensor" output_stream: "FLOAT:face_presence_score" } @@ -124,11 +107,11 @@ node { # Decodes the landmark tensors into a vector of lanmarks, where the landmark # coordinates are normalized by the size of the input image to the model. node { - calculator: "TfLiteTensorsToLandmarksCalculator" + calculator: "TensorsToLandmarksCalculator" input_stream: "TENSORS:ensured_landmark_tensors" output_stream: "NORM_LANDMARKS:landmarks" options: { - [mediapipe.TfLiteTensorsToLandmarksCalculatorOptions.ext] { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { num_landmarks: 468 input_image_width: 192 input_image_height: 192 diff --git a/mediapipe/modules/hand_landmark/BUILD b/mediapipe/modules/hand_landmark/BUILD new file mode 100644 index 0000000000..274147105c --- /dev/null +++ b/mediapipe/modules/hand_landmark/BUILD @@ -0,0 +1,130 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load( + "//mediapipe/framework/tool:mediapipe_graph.bzl", + "mediapipe_simple_subgraph", +) + +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +exports_files([ + "hand_landmark.tflite", + "handedness.txt", +]) + +mediapipe_simple_subgraph( + name = "hand_landmark_cpu", + graph = "hand_landmark_cpu.pbtxt", + register_as = "HandLandmarkCpu", + deps = [ + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:split_vector_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_classification_calculator", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", + "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", + "//mediapipe/calculators/util:landmark_projection_calculator", + "//mediapipe/calculators/util:logic_calculator", + "//mediapipe/calculators/util:thresholding_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "hand_landmark_gpu", + graph = "hand_landmark_gpu.pbtxt", + register_as = "HandLandmarkGpu", + deps = [ + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:split_vector_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_classification_calculator", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", + "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", + "//mediapipe/calculators/util:landmark_projection_calculator", + "//mediapipe/calculators/util:logic_calculator", + "//mediapipe/calculators/util:thresholding_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "hand_landmark_tracking_gpu", + graph = "hand_landmark_tracking_gpu.pbtxt", + register_as = "HandLandmarkTrackingGpu", + deps = [ + ":hand_landmark_gpu", + ":hand_landmark_landmarks_to_roi", + ":palm_detection_detection_to_roi", + "//mediapipe/calculators/core:begin_loop_calculator", + "//mediapipe/calculators/core:clip_vector_size_calculator", + "//mediapipe/calculators/core:end_loop_calculator", + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:previous_loopback_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:association_norm_rect_calculator", + "//mediapipe/calculators/util:collection_has_min_size_calculator", + "//mediapipe/calculators/util:filter_collection_calculator", + "//mediapipe/modules/palm_detection:palm_detection_gpu", + ], +) + +mediapipe_simple_subgraph( + name = "hand_landmark_tracking_cpu", + graph = "hand_landmark_tracking_cpu.pbtxt", + register_as = "HandLandmarkTrackingCpu", + deps = [ + ":hand_landmark_cpu", + ":hand_landmark_landmarks_to_roi", + ":palm_detection_detection_to_roi", + 
"//mediapipe/calculators/core:begin_loop_calculator", + "//mediapipe/calculators/core:clip_vector_size_calculator", + "//mediapipe/calculators/core:end_loop_calculator", + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:previous_loopback_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:association_norm_rect_calculator", + "//mediapipe/calculators/util:collection_has_min_size_calculator", + "//mediapipe/calculators/util:filter_collection_calculator", + "//mediapipe/modules/palm_detection:palm_detection_gpu", + ], +) + +mediapipe_simple_subgraph( + name = "palm_detection_detection_to_roi", + graph = "palm_detection_detection_to_roi.pbtxt", + register_as = "PalmDetectionDetectionToRoi", + deps = [ + "//mediapipe/calculators/util:detections_to_rects_calculator", + "//mediapipe/calculators/util:rect_transformation_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "hand_landmark_landmarks_to_roi", + graph = "hand_landmark_landmarks_to_roi.pbtxt", + register_as = "HandLandmarkLandmarksToRoi", + deps = [ + "//mediapipe/calculators/core:split_normalized_landmark_list_calculator", + "//mediapipe/calculators/util:rect_transformation_calculator", + "//mediapipe/modules/hand_landmark/calculators:hand_landmarks_to_rect_calculator", + ], +) diff --git a/mediapipe/modules/hand_landmark/README.md b/mediapipe/modules/hand_landmark/README.md new file mode 100644 index 0000000000..31fe6f720f --- /dev/null +++ b/mediapipe/modules/hand_landmark/README.md @@ -0,0 +1,8 @@ +# hand_landmark + +Subgraphs|Details +:--- | :--- +[`HandLandmarkCpu`](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_cpu.pbtxt)| Detects landmarks of a single hand. (CPU input.) +[`HandLandmarkGpu`](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_gpu.pbtxt)| Detects landmarks of a single hand. (GPU input.) +[`HandLandmarkTrackingCpu`](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_tracking_cpu.pbtxt)| Detects and tracks landmarks of multiple hands. (CPU input.) +[`HandLandmarkTrackingGpu`](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt)| Detects and tracks landmarks of multiple hands. (GPU input.) diff --git a/mediapipe/modules/hand_landmark/calculators/BUILD b/mediapipe/modules/hand_landmark/calculators/BUILD new file mode 100644 index 0000000000..b2a8efe370 --- /dev/null +++ b/mediapipe/modules/hand_landmark/calculators/BUILD @@ -0,0 +1,33 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "hand_landmarks_to_rect_calculator", + srcs = ["hand_landmarks_to_rect_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:calculator_options_cc_proto", + "//mediapipe/framework/formats:landmark_cc_proto", + "//mediapipe/framework/formats:location_data_cc_proto", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + ], + alwayslink = 1, +) diff --git a/mediapipe/graphs/hand_tracking/calculators/hand_landmarks_to_rect_calculator.cc b/mediapipe/modules/hand_landmark/calculators/hand_landmarks_to_rect_calculator.cc similarity index 100% rename from mediapipe/graphs/hand_tracking/calculators/hand_landmarks_to_rect_calculator.cc rename to mediapipe/modules/hand_landmark/calculators/hand_landmarks_to_rect_calculator.cc diff --git a/mediapipe/modules/hand_landmark/hand_landmark.tflite b/mediapipe/modules/hand_landmark/hand_landmark.tflite new file mode 100644 index 0000000000..383135e51c Binary files /dev/null and b/mediapipe/modules/hand_landmark/hand_landmark.tflite differ diff --git a/mediapipe/modules/hand_landmark/hand_landmark_cpu.pbtxt b/mediapipe/modules/hand_landmark/hand_landmark_cpu.pbtxt new file mode 100644 index 0000000000..246a1fc2e6 --- /dev/null +++ b/mediapipe/modules/hand_landmark/hand_landmark_cpu.pbtxt @@ -0,0 +1,163 @@ +# MediaPipe graph to detect/predict hand landmarks on CPU. + +type: "HandLandmarkCpu" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" +# ROI (region of interest) within the given image where a palm/hand is located. +# (NormalizedRect) +input_stream: "ROI:hand_rect" + +# 21 hand landmarks within the given ROI. (NormalizedLandmarkList) +# NOTE: if a hand is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:hand_landmarks" + +# Handedness of the detected hand (i.e. is hand left or right). +# (ClassificationList) +output_stream: "HANDEDNESS:handedness" + +# Transforms a region of image into a 224x224 tensor while keeping the aspect +# ratio, and therefore may result in potential letterboxing. +node { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:image" + input_stream: "NORM_RECT:hand_rect" + output_stream: "TENSORS:input_tensor" + output_stream: "LETTERBOX_PADDING:letterbox_padding" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 224 + output_tensor_height: 224 + keep_aspect_ratio: true + output_tensor_float_range { + min: 0.0 + max: 1.0 + } + } + } +} + +# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensor" + output_stream: "TENSORS:output_tensors" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + model_path: "mediapipe/modules/hand_landmark/hand_landmark.tflite" + delegate { xnnpack {} } + } + } +} + +# Splits a vector of tensors to multiple vectors according to the ranges +# specified in option. 
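The SplitTensorVectorCalculator node below relies on hand_landmark.tflite emitting three output tensors in a fixed order: the 21 landmarks (21 x 3 = 63 floats), a hand-presence flag, and a handedness score; the 0-1 / 1-2 / 2-3 ranges simply route each one to its own stream. A small Python illustration of that routing, with stand-in values (the tensor contents are inferred from the decode options later in this file):

```python
# Illustrative stand-ins for the three output tensors of hand_landmark.tflite.
output_tensors = [
    [0.0] * (21 * 3),   # landmark tensor: 21 landmarks x (x, y, z)
    [0.98],             # hand-presence flag
    [0.12],             # handedness score
]

ranges = [(0, 1), (1, 2), (2, 3)]   # as configured in SplitVectorCalculatorOptions
landmark_tensors, hand_flag_tensor, handedness_tensor = (
    output_tensors[begin:end] for begin, end in ranges
)
print(len(landmark_tensors[0]), hand_flag_tensor, handedness_tensor)
# 63 [[0.98]] [[0.12]]
```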
+node { + calculator: "SplitTensorVectorCalculator" + input_stream: "output_tensors" + output_stream: "landmark_tensors" + output_stream: "hand_flag_tensor" + output_stream: "handedness_tensor" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 1 } + ranges: { begin: 1 end: 2 } + ranges: { begin: 2 end: 3 } + } + } +} + +# Converts the hand-flag tensor into a float that represents the confidence +# score of hand presence. +node { + calculator: "TensorsToFloatsCalculator" + input_stream: "TENSORS:hand_flag_tensor" + output_stream: "FLOAT:hand_presence_score" +} + +# Applies a threshold to the confidence score to determine whether a hand is +# present. +node { + calculator: "ThresholdingCalculator" + input_stream: "FLOAT:hand_presence_score" + output_stream: "FLAG:hand_presence" + options: { + [mediapipe.ThresholdingCalculatorOptions.ext] { + threshold: 0.5 + } + } +} + +# Drops handedness tensor if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "handedness_tensor" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_handedness_tensor" +} + +# Converts the handedness tensor into a float that represents the classification +# score of handedness. +node { + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:ensured_handedness_tensor" + output_stream: "CLASSIFICATIONS:handedness" + options: { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] { + top_k: 1 + label_map_path: "mediapipe/modules/hand_landmark/handedness.txt" + binary_classification: true + } + } +} + +# Drops landmarks tensors if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "landmark_tensors" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_landmark_tensors" +} + +# Decodes the landmark tensors into a list of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "TensorsToLandmarksCalculator" + input_stream: "TENSORS:ensured_landmark_tensors" + output_stream: "NORM_LANDMARKS:landmarks" + options: { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 21 + input_image_width: 224 + input_image_height: 224 + # The additional scaling factor is used to account for the Z coordinate + # distribution in the training data. + normalize_z: 0.4 + } + } +} + +# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand +# image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (hand +# image before image transformation). +node { + calculator: "LandmarkLetterboxRemovalCalculator" + input_stream: "LANDMARKS:landmarks" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "LANDMARKS:scaled_landmarks" +} + +# Projects the landmarks from the cropped hand image to the corresponding +# locations on the full image before cropping (input to the graph). +node { + calculator: "LandmarkProjectionCalculator" + input_stream: "NORM_LANDMARKS:scaled_landmarks" + input_stream: "NORM_RECT:hand_rect" + output_stream: "NORM_LANDMARKS:hand_landmarks" +} diff --git a/mediapipe/modules/hand_landmark/hand_landmark_gpu.pbtxt b/mediapipe/modules/hand_landmark/hand_landmark_gpu.pbtxt new file mode 100644 index 0000000000..071a72f8f6 --- /dev/null +++ b/mediapipe/modules/hand_landmark/hand_landmark_gpu.pbtxt @@ -0,0 +1,163 @@ +# MediaPipe graph to detect/predict hand landmarks on CPU. 
+ +type: "HandLandmarkGpu" + +# GPU image. (GpuBuffer) +input_stream: "IMAGE:image" +# ROI (region of interest) within the given image where a palm/hand is located. +# (NormalizedRect) +input_stream: "ROI:hand_rect" + +# 21 hand landmarks within the given ROI. (NormalizedLandmarkList) +# NOTE: if a hand is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:hand_landmarks" + +# Handedness of the detected hand (i.e. is hand left or right). +# (ClassificationList) +output_stream: "HANDEDNESS:handedness" + +# Transforms a region of image into a 224x224 tensor while keeping the aspect +# ratio, and therefore may result in potential letterboxing. +node { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE_GPU:image" + input_stream: "NORM_RECT:hand_rect" + output_stream: "TENSORS:input_tensor" + output_stream: "LETTERBOX_PADDING:letterbox_padding" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 224 + output_tensor_height: 224 + keep_aspect_ratio: true + output_tensor_float_range { + min: 0.0 + max: 1.0 + } + gpu_origin: TOP_LEFT + } + } +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensor" + output_stream: "TENSORS:output_tensors" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + model_path: "mediapipe/modules/hand_landmark/hand_landmark.tflite" + } + } +} + +# Splits a vector of tensors to multiple vectors according to the ranges +# specified in option. +node { + calculator: "SplitTensorVectorCalculator" + input_stream: "output_tensors" + output_stream: "landmark_tensors" + output_stream: "hand_flag_tensor" + output_stream: "handedness_tensor" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 1 } + ranges: { begin: 1 end: 2 } + ranges: { begin: 2 end: 3 } + } + } +} + +# Converts the hand-flag tensor into a float that represents the confidence +# score of hand presence. +node { + calculator: "TensorsToFloatsCalculator" + input_stream: "TENSORS:hand_flag_tensor" + output_stream: "FLOAT:hand_presence_score" +} + +# Applies a threshold to the confidence score to determine whether a hand is +# present. +node { + calculator: "ThresholdingCalculator" + input_stream: "FLOAT:hand_presence_score" + output_stream: "FLAG:hand_presence" + options: { + [mediapipe.ThresholdingCalculatorOptions.ext] { + threshold: 0.5 + } + } +} + +# Drops handedness tensor if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "handedness_tensor" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_handedness_tensor" +} + +# Converts the handedness tensor into a float that represents the classification +# score of handedness. +node { + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:ensured_handedness_tensor" + output_stream: "CLASSIFICATIONS:handedness" + options: { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] { + top_k: 1 + label_map_path: "mediapipe/modules/hand_landmark/handedness.txt" + binary_classification: true + } + } +} + +# Drops landmarks tensors if hand is not present. 
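A note on the handedness classifier just above, before the landmark gate below: with top_k 1 and binary_classification, the single sigmoid score is read against the two entries of handedness.txt. My understanding, which is an assumption since neither the label file contents nor the calculator internals appear in this change, is that the score and its complement act as the two class probabilities, so a score above 0.5 selects the first label.

```python
def classify_handedness(score: float, labels=("Left", "Right")):
    """Sketch of a binary-classification readout: score versus its complement.
    The label names and their orientation are assumptions, not taken from
    handedness.txt."""
    probs = {labels[0]: score, labels[1]: 1.0 - score}
    best = max(probs, key=probs.get)
    return best, probs[best]

print(classify_handedness(0.85))   # ('Left', 0.85)
print(classify_handedness(0.12))   # ('Right', 0.88)
```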
+node { + calculator: "GateCalculator" + input_stream: "landmark_tensors" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_landmark_tensors" +} + +# Decodes the landmark tensors into a list of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "TensorsToLandmarksCalculator" + input_stream: "TENSORS:ensured_landmark_tensors" + output_stream: "NORM_LANDMARKS:landmarks" + options: { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 21 + input_image_width: 224 + input_image_height: 224 + # The additional scaling factor is used to account for the Z coordinate + # distribution in the training data. + normalize_z: 0.4 + } + } +} + +# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand +# image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (hand +# image before image transformation). +node { + calculator: "LandmarkLetterboxRemovalCalculator" + input_stream: "LANDMARKS:landmarks" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "LANDMARKS:scaled_landmarks" +} + +# Projects the landmarks from the cropped hand image to the corresponding +# locations on the full image before cropping (input to the graph). +node { + calculator: "LandmarkProjectionCalculator" + input_stream: "NORM_LANDMARKS:scaled_landmarks" + input_stream: "NORM_RECT:hand_rect" + output_stream: "NORM_LANDMARKS:hand_landmarks" +} diff --git a/mediapipe/modules/hand_landmark/hand_landmark_landmarks_to_roi.pbtxt b/mediapipe/modules/hand_landmark/hand_landmark_landmarks_to_roi.pbtxt new file mode 100644 index 0000000000..1d82d76722 --- /dev/null +++ b/mediapipe/modules/hand_landmark/hand_landmark_landmarks_to_roi.pbtxt @@ -0,0 +1,63 @@ +# MediaPipe graph to calculate hand region of interest (ROI) from landmarks +# detected by "HandLandmarkCpu" or "HandLandmarkGpu". + +type: "HandLandmarkLandmarksToRoi" + +# Normalized landmarks. (NormalizedLandmarkList) +input_stream: "LANDMARKS:landmarks" +# Image size (width & height). (std::pair) +input_stream: "IMAGE_SIZE:image_size" + +# ROI according to landmarks. (NormalizedRect) +output_stream: "ROI:roi" + +# Extracts a subset of the hand landmarks that are relatively more stable across +# frames (e.g. comparing to finger tips) for computing the bounding box. The box +# will later be expanded to contain the entire hand. In this approach, it is +# more robust to drastically changing hand size. +# The landmarks extracted are: wrist, MCP/PIP of five fingers. +node { + calculator: "SplitNormalizedLandmarkListCalculator" + input_stream: "landmarks" + output_stream: "partial_landmarks" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 4 } + ranges: { begin: 5 end: 7 } + ranges: { begin: 9 end: 11 } + ranges: { begin: 13 end: 15 } + ranges: { begin: 17 end: 19 } + combine_outputs: true + } + } +} + +# Converts the hand landmarks into a rectangle (normalized by image size) +# that encloses the hand. The calculator uses a subset of all hand landmarks +# extracted from SplitNormalizedLandmarkListCalculator above to +# calculate the bounding box and the rotation of the output rectangle. Please +# see the comments in the calculator for more detail. 
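The HandLandmarksToRectCalculator node that follows turns that stable landmark subset into a rotated ROI; its rotation logic lives in the calculator source referenced above. As a rough, rotation-free approximation, the bounding-box part is just a min/max over the selected normalized points:

```python
def landmarks_to_rect(points):
    """points: normalized (x, y) landmarks from the stable subset.
    Returns (cx, cy, w, h); the real calculator additionally estimates the
    rotation of the rect, which this sketch omits."""
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)
    return ((x_min + x_max) / 2, (y_min + y_max) / 2, x_max - x_min, y_max - y_min)

# Wrist plus a few MCP/PIP joints, roughly centered in the frame.
print(landmarks_to_rect([(0.42, 0.70), (0.48, 0.52), (0.55, 0.50), (0.60, 0.55)]))
```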
+node { + calculator: "HandLandmarksToRectCalculator" + input_stream: "NORM_LANDMARKS:partial_landmarks" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "NORM_RECT:hand_rect_from_landmarks" +} + +# Expands the hand rectangle so that the box contains the entire hand and it's +# big enough so that it's likely to still contain the hand even with some motion +# in the next video frame . +node { + calculator: "RectTransformationCalculator" + input_stream: "NORM_RECT:hand_rect_from_landmarks" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "roi" + options: { + [mediapipe.RectTransformationCalculatorOptions.ext] { + scale_x: 2.0 + scale_y: 2.0 + shift_y: -0.1 + square_long: true + } + } +} diff --git a/mediapipe/modules/hand_landmark/hand_landmark_tracking_cpu.pbtxt b/mediapipe/modules/hand_landmark/hand_landmark_tracking_cpu.pbtxt new file mode 100644 index 0000000000..470bfc0000 --- /dev/null +++ b/mediapipe/modules/hand_landmark/hand_landmark_tracking_cpu.pbtxt @@ -0,0 +1,241 @@ +# MediaPipe graph to detect/predict hand landmarks on CPU. +# +# The procedure is done in two steps: +# - locate palms/hands +# - detect landmarks for each palm/hand. +# This graph tries to skip palm detection as much as possible by reusing +# previously detected/predicted landmarks for new images. + +type: "HandLandmarkTrackingCpu" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" + +# Max number of hands to detect/track. (int) +input_side_packet: "NUM_HANDS:num_hands" + +# Whether palm detection can be skipped when hand regions can already be +# approximated from hand landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + +# Collection of detected/predicted hands, each represented as a list of +# landmarks. (std::vector) +# NOTE: there will not be an output packet in the LANDMARKS stream for this +# particular timestamp if none of hands detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:multi_hand_landmarks" +# Collection of handedness of the detected hands (i.e. is hand left or right), +# each represented as a Classification proto. +# Note that handedness is determined assuming the input image is mirrored, +# i.e., taken with a front-facing/selfie camera with images flipped +# horizontally. +output_stream: "HANDEDNESS:multi_handedness" + +# Extra outputs (for debugging, for instance). +# Detected palms. (std::vector) +output_stream: "PALM_DETECTIONS:palm_detections" +# Regions of interest calculated based on landmarks. +# (std::vector) +output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects" +# Regions of interest calculated based on palm detections. +# (std::vector) +output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections" + +# Determines if an input vector of NormalizedRect has a size greater than or +# equal to the provided num_hands. +node { + calculator: "NormalizedRectVectorHasMinSizeCalculator" + input_stream: "ITERABLE:prev_hand_rects_from_landmarks" + input_side_packet: "num_hands" + output_stream: "prev_has_enough_hands" +} + +# Drops the incoming image if the previous image had at least N hands. +# and skipping palm detection is enabled. +# Otherwise, passes the incoming image through to trigger a new round of palm +# detection. 
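The LogicCalculator/GateCalculator pair that follows implements exactly the behavior described above, mirroring the face graphs earlier in this change: palm detection is skipped only when skipping is enabled and the previous frame already produced enough hand rects. As straight-line Python, under the assumption that when detection runs the previous rects are simply not forwarded (the ALLOW gate stays closed):

```python
def route_frame(image, prev_hand_rects, num_hands, can_skip_detection):
    """Sketch of the per-frame gating in HandLandmarkTrackingCpu."""
    prev_has_enough_hands = len(prev_hand_rects) >= num_hands
    skip_detection = can_skip_detection and prev_has_enough_hands  # LogicCalculator AND

    if skip_detection:
        # Reuse ROIs predicted from the previous frame's landmarks.
        return None, prev_hand_rects       # no image is sent to palm detection
    # Trigger a new round of palm detection; previous rects are not forwarded.
    return image, []

print(route_frame("frame", ["roi_a", "roi_b"], num_hands=2, can_skip_detection=True))
# (None, ['roi_a', 'roi_b'])
print(route_frame("frame", ["roi_a"], num_hands=2, can_skip_detection=True))
# ('frame', [])
```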
+node { + calculator: "LogicCalculator" + options: { + [mediapipe.LogicCalculatorOptions.ext] { op: AND } + } + input_side_packet: "can_skip_detection" + input_stream: "prev_has_enough_hands" + output_stream: "skip_detection" +} + +node { + calculator: "GateCalculator" + input_stream: "image" + input_stream: "DISALLOW:skip_detection" + output_stream: "palm_detection_image" + options: { + [mediapipe.GateCalculatorOptions.ext] { + empty_packets_as_allow: true + } + } +} +node { + calculator: "GateCalculator" + input_stream: "prev_hand_rects_from_landmarks" + input_stream: "ALLOW:skip_detection" + output_stream: "gated_prev_hand_rects_from_landmarks" +} + +# Detects palms. +node { + calculator: "PalmDetectionCpu" + input_stream: "IMAGE:palm_detection_image" + output_stream: "DETECTIONS:all_palm_detections" +} + +# Makes sure there are no more detections than the provided num_hands. +node { + calculator: "ClipDetectionVectorSizeCalculator" + input_stream: "all_palm_detections" + output_stream: "palm_detections" + input_side_packet: "num_hands" +} + +# Extracts image size. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE:palm_detection_image" + output_stream: "SIZE:palm_detection_image_size" +} + +# Outputs each element of palm_detections at a fake timestamp for the rest of +# the graph to process. Clones the image size packet for each palm_detection at +# the fake timestamp. At the end of the loop, outputs the BATCH_END timestamp +# for downstream calculators to inform them that all elements in the vector have +# been processed. +node { + calculator: "BeginLoopDetectionCalculator" + input_stream: "ITERABLE:palm_detections" + input_stream: "CLONE:palm_detection_image_size" + output_stream: "ITEM:palm_detection" + output_stream: "CLONE:image_size_for_palms" + output_stream: "BATCH_END:palm_detections_timestamp" +} + +# Calculates region of interest (ROI) based on the specified palm. +node { + calculator: "PalmDetectionDetectionToRoi" + input_stream: "DETECTION:palm_detection" + input_stream: "IMAGE_SIZE:image_size_for_palms" + output_stream: "ROI:hand_rect_from_palm_detection" +} + +# Collects a NormalizedRect for each hand into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END +# timestamp. +node { + calculator: "EndLoopNormalizedRectCalculator" + input_stream: "ITEM:hand_rect_from_palm_detection" + input_stream: "BATCH_END:palm_detections_timestamp" + output_stream: "ITERABLE:hand_rects_from_palm_detections" +} + +# Performs association between NormalizedRect vector elements from previous +# image and rects based on palm detections from the current image. This +# calculator ensures that the output hand_rects vector doesn't contain +# overlapping regions based on the specified min_similarity_threshold. +node { + calculator: "AssociationNormRectCalculator" + input_stream: "gated_prev_hand_rects_from_landmarks" + input_stream: "hand_rects_from_palm_detections" + output_stream: "hand_rects" + options: { + [mediapipe.AssociationCalculatorOptions.ext] { + min_similarity_threshold: 0.5 + } + } +} + +# Extracts image size. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE:image" + output_stream: "SIZE:image_size" +} + +# Outputs each element of hand_rects at a fake timestamp for the rest of the +# graph to process. Clones image and image size packets for each +# single_hand_rect at the fake timestamp. 
At the end of the loop, outputs the +# BATCH_END timestamp for downstream calculators to inform them that all +# elements in the vector have been processed. +node { + calculator: "BeginLoopNormalizedRectCalculator" + input_stream: "ITERABLE:hand_rects" + input_stream: "CLONE:0:image" + input_stream: "CLONE:1:image_size" + output_stream: "ITEM:single_hand_rect" + output_stream: "CLONE:0:image_for_landmarks" + output_stream: "CLONE:1:image_size_for_landmarks" + output_stream: "BATCH_END:hand_rects_timestamp" +} + +# Detect hand landmarks for the specific hand rect. +node { + calculator: "HandLandmarkCpu" + input_stream: "IMAGE:image_for_landmarks" + input_stream: "ROI:single_hand_rect" + output_stream: "LANDMARKS:single_hand_landmarks" + output_stream: "HANDEDNESS:single_handedness" +} + +# Collects the handedness for each single hand into a vector. Upon +# receiving the BATCH_END timestamp, outputs a vector of classification at the +# BATCH_END timestamp. +node { + calculator: "EndLoopClassificationListCalculator" + input_stream: "ITEM:single_handedness" + input_stream: "BATCH_END:hand_rects_timestamp" + output_stream: "ITERABLE:multi_handedness" +} + +# Calculate region of interest (ROI) based on detected hand landmarks to reuse +# on the subsequent runs of the graph. +node { + calculator: "HandLandmarkLandmarksToRoi" + input_stream: "IMAGE_SIZE:image_size_for_landmarks" + input_stream: "LANDMARKS:single_hand_landmarks" + output_stream: "ROI:single_hand_rect_from_landmarks" +} + +# Collects a set of landmarks for each hand into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END +# timestamp. +node { + calculator: "EndLoopNormalizedLandmarkListVectorCalculator" + input_stream: "ITEM:single_hand_landmarks" + input_stream: "BATCH_END:hand_rects_timestamp" + output_stream: "ITERABLE:multi_hand_landmarks" +} + +# Collects a NormalizedRect for each hand into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END +# timestamp. +node { + calculator: "EndLoopNormalizedRectCalculator" + input_stream: "ITEM:single_hand_rect_from_landmarks" + input_stream: "BATCH_END:hand_rects_timestamp" + output_stream: "ITERABLE:hand_rects_from_landmarks" +} + +# Caches hand rects calculated from landmarks, and upon the arrival of the next +# input image, sends out the cached rects with timestamps replaced by that of +# the input image, essentially generating a packet that carries the previous +# hand rects. Note that upon the arrival of the very first input image, a +# timestamp bound update occurs to jump start the feedback loop. +node { + calculator: "PreviousLoopbackCalculator" + input_stream: "MAIN:image" + input_stream: "LOOP:hand_rects_from_landmarks" + input_stream_info: { + tag_index: "LOOP" + back_edge: true + } + output_stream: "PREV_LOOP:prev_hand_rects_from_landmarks" +} diff --git a/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt b/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt new file mode 100644 index 0000000000..640676a3d7 --- /dev/null +++ b/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt @@ -0,0 +1,241 @@ +# MediaPipe graph to detect/predict hand landmarks on GPU. +# +# The procedure is done in two steps: +# - locate palms/hands +# - detect landmarks for each palm/hand. +# This graph tries to skip palm detection as much as possible by reusing +# previously detected/predicted landmarks for new images. 
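The CPU tracking graph above is the kind of pipeline wrapped by the Hands solution in the `mediapipe` Python package. A usage sketch, assuming the published `mp.solutions.hands` API (parameter and result names should be checked against the installed version):

```python
import cv2
import mediapipe as mp

mp_hands = mp.solutions.hands

cap = cv2.VideoCapture(0)
with mp_hands.Hands(max_num_hands=2,
                    min_detection_confidence=0.5,
                    min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        ok, frame = cap.read()
        if not ok:
            break
        # The graph expects RGB input; OpenCV delivers BGR.
        results = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if results.multi_hand_landmarks:
            for handedness, landmarks in zip(results.multi_handedness,
                                             results.multi_hand_landmarks):
                wrist = landmarks.landmark[0]
                print(handedness.classification[0].label,
                      round(wrist.x, 3), round(wrist.y, 3))
cap.release()
```

The LANDMARKS and HANDEDNESS outputs of the graph surface as `multi_hand_landmarks` and `multi_handedness` here; as noted in the graph comments, both are absent for frames where no hand is found.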
+ +type: "HandLandmarkTrackingGpu" + +# GPU image. (GpuBuffer) +input_stream: "IMAGE:image" + +# Max number of hands to detect/track. (int) +input_side_packet: "NUM_HANDS:num_hands" + +# Whether palm detection can be skipped when hand regions can already be +# approximated from hand landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + +# Collection of detected/predicted hands, each represented as a list of +# landmarks. (std::vector) +# NOTE: there will not be an output packet in the LANDMARKS stream for this +# particular timestamp if none of hands detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:multi_hand_landmarks" +# Collection of handedness of the detected hands (i.e. is hand left or right), +# each represented as a Classification proto. +# Note that handedness is determined assuming the input image is mirrored, +# i.e., taken with a front-facing/selfie camera with images flipped +# horizontally. +output_stream: "HANDEDNESS:multi_handedness" + +# Extra outputs (for debugging, for instance). +# Detected palms. (std::vector) +output_stream: "PALM_DETECTIONS:palm_detections" +# Regions of interest calculated based on landmarks. +# (std::vector) +output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects" +# Regions of interest calculated based on palm detections. +# (std::vector) +output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections" + +# Determines if an input vector of NormalizedRect has a size greater than or +# equal to the provided num_hands. +node { + calculator: "NormalizedRectVectorHasMinSizeCalculator" + input_stream: "ITERABLE:prev_hand_rects_from_landmarks" + input_side_packet: "num_hands" + output_stream: "prev_has_enough_hands" +} + +# Drops the incoming image if the previous image had at least N hands. +# and skipping palm detection is enabled. +# Otherwise, passes the incoming image through to trigger a new round of palm +# detection. +node { + calculator: "LogicCalculator" + options: { + [mediapipe.LogicCalculatorOptions.ext] { op: AND } + } + input_side_packet: "can_skip_detection" + input_stream: "prev_has_enough_hands" + output_stream: "skip_detection" +} +node { + calculator: "GateCalculator" + input_stream: "image" + input_stream: "DISALLOW:skip_detection" + output_stream: "palm_detection_image" + options: { + [mediapipe.GateCalculatorOptions.ext] { + empty_packets_as_allow: true + } + } +} +node { + calculator: "GateCalculator" + input_stream: "prev_hand_rects_from_landmarks" + input_stream: "ALLOW:skip_detection" + output_stream: "gated_prev_hand_rects_from_landmarks" +} + +# Detects palms. +node { + calculator: "PalmDetectionGpu" + input_stream: "IMAGE:palm_detection_image" + output_stream: "DETECTIONS:all_palm_detections" +} + +# Makes sure there are no more detections than provided num_hands. +node { + calculator: "ClipDetectionVectorSizeCalculator" + input_stream: "all_palm_detections" + output_stream: "palm_detections" + input_side_packet: "num_hands" +} + +# Extracts image size. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_GPU:palm_detection_image" + output_stream: "SIZE:palm_detection_image_size" +} + +# Outputs each element of palm_detections at a fake timestamp for the rest of +# the graph to process. Clones the image_size packet for each palm_detection at +# the fake timestamp. 
At the end of the loop, outputs the BATCH_END timestamp +# for downstream calculators to inform them that all elements in the vector have +# been processed. +node { + calculator: "BeginLoopDetectionCalculator" + input_stream: "ITERABLE:palm_detections" + input_stream: "CLONE:palm_detection_image_size" + output_stream: "ITEM:palm_detection" + output_stream: "CLONE:image_size_for_palms" + output_stream: "BATCH_END:palm_detections_timestamp" +} + +# Calculates region of interest (ROI) base on the specified palm. +node { + calculator: "PalmDetectionDetectionToRoi" + input_stream: "DETECTION:palm_detection" + input_stream: "IMAGE_SIZE:image_size_for_palms" + output_stream: "ROI:hand_rect_from_palm_detection" +} + +# Collects a NormalizedRect for each hand into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END +# timestamp. +node { + name: "EndLoopForPalmDetections" + calculator: "EndLoopNormalizedRectCalculator" + input_stream: "ITEM:hand_rect_from_palm_detection" + input_stream: "BATCH_END:palm_detections_timestamp" + output_stream: "ITERABLE:hand_rects_from_palm_detections" +} + +# Performs association between NormalizedRect vector elements from previous +# image and rects based on palm detections from the current image. This +# calculator ensures that the output hand_rects vector doesn't contain +# overlapping regions based on the specified min_similarity_threshold. +node { + calculator: "AssociationNormRectCalculator" + input_stream: "gated_prev_hand_rects_from_landmarks" + input_stream: "hand_rects_from_palm_detections" + output_stream: "hand_rects" + options: { + [mediapipe.AssociationCalculatorOptions.ext] { + min_similarity_threshold: 0.5 + } + } +} + +# Extracts image size. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_GPU:image" + output_stream: "SIZE:image_size" +} + +# Outputs each element of hand_rects at a fake timestamp for the rest of the +# graph to process. Clones image and image size packets for each +# single_hand_rect at the fake timestamp. At the end of the loop, outputs the +# BATCH_END timestamp for downstream calculators to inform them that all +# elements in the vector have been processed. +node { + calculator: "BeginLoopNormalizedRectCalculator" + input_stream: "ITERABLE:hand_rects" + input_stream: "CLONE:0:image" + input_stream: "CLONE:1:image_size" + output_stream: "ITEM:single_hand_rect" + output_stream: "CLONE:0:image_for_landmarks" + output_stream: "CLONE:1:image_size_for_landmarks" + output_stream: "BATCH_END:hand_rects_timestamp" +} + +# Detect hand landmarks for the specific hand rect. +node { + calculator: "HandLandmarkGpu" + input_stream: "IMAGE:image_for_landmarks" + input_stream: "ROI:single_hand_rect" + output_stream: "LANDMARKS:single_hand_landmarks" + output_stream: "HANDEDNESS:single_handedness" +} + +# Collects the handedness for each single hand into a vector. Upon +# receiving the BATCH_END timestamp, outputs a vector of classification at the +# BATCH_END timestamp. +node { + calculator: "EndLoopClassificationListCalculator" + input_stream: "ITEM:single_handedness" + input_stream: "BATCH_END:hand_rects_timestamp" + output_stream: "ITERABLE:multi_handedness" +} + +# Calculate region of interest (ROI) based on detected hand landmarks to reuse +# on the subsequent runs of the graph. 
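Both tracking graphs gate palm detection with a LogicCalculator AND plus a pair of GateCalculators, then loop landmark prediction over each ROI. A plain-Python sketch of that per-frame control flow is below; every callable is a placeholder standing in for the corresponding subgraph or calculator, not a MediaPipe API.

```python
def track_hands(frame, state, num_hands, can_skip_detection,
                detect_palms, roi_from_detection, landmarks_in_roi,
                roi_from_landmarks):
    """One frame of the detect-or-track flow sketched by the graphs above.

    state["prev_rois"] plays the role of prev_hand_rects_from_landmarks
    (fed back by PreviousLoopbackCalculator). Illustrative only.
    """
    prev_rois = state.get("prev_rois", [])
    # LogicCalculator(AND) over CAN_SKIP_DETECTION and
    # NormalizedRectVectorHasMinSizeCalculator's output.
    skip_detection = can_skip_detection and len(prev_rois) >= num_hands

    if skip_detection:
        rois = prev_rois                              # GateCalculator ALLOW branch
    else:
        # GateCalculator DISALLOW branch: run palm detection again.
        detections = detect_palms(frame)[:num_hands]  # ClipDetectionVectorSize
        rois = [roi_from_detection(d, frame) for d in detections]
        # The graph additionally runs AssociationNormRectCalculator so that
        # overlapping regions are not tracked twice.

    hands = [landmarks_in_roi(frame, roi) for roi in rois]   # Begin/EndLoop
    state["prev_rois"] = [roi_from_landmarks(lms, frame) for lms in hands]
    return hands
```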
+node { + calculator: "HandLandmarkLandmarksToRoi" + input_stream: "IMAGE_SIZE:image_size_for_landmarks" + input_stream: "LANDMARKS:single_hand_landmarks" + output_stream: "ROI:single_hand_rect_from_landmarks" +} + +# Collects a set of landmarks for each hand into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END +# timestamp. +node { + calculator: "EndLoopNormalizedLandmarkListVectorCalculator" + input_stream: "ITEM:single_hand_landmarks" + input_stream: "BATCH_END:hand_rects_timestamp" + output_stream: "ITERABLE:multi_hand_landmarks" +} + +# Collects a NormalizedRect for each hand into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END +# timestamp. +node { + calculator: "EndLoopNormalizedRectCalculator" + input_stream: "ITEM:single_hand_rect_from_landmarks" + input_stream: "BATCH_END:hand_rects_timestamp" + output_stream: "ITERABLE:hand_rects_from_landmarks" +} + +# Caches hand rects calculated from landmarks, and upon the arrival of the next +# input image, sends out the cached rects with timestamps replaced by that of +# the input image, essentially generating a packet that carries the previous +# hand rects. Note that upon the arrival of the very first input image, a +# timestamp bound update occurs to jump start the feedback loop. +node { + calculator: "PreviousLoopbackCalculator" + input_stream: "MAIN:image" + input_stream: "LOOP:hand_rects_from_landmarks" + input_stream_info: { + tag_index: "LOOP" + back_edge: true + } + output_stream: "PREV_LOOP:prev_hand_rects_from_landmarks" +} diff --git a/mediapipe/models/handedness.txt b/mediapipe/modules/hand_landmark/handedness.txt similarity index 100% rename from mediapipe/models/handedness.txt rename to mediapipe/modules/hand_landmark/handedness.txt diff --git a/mediapipe/modules/hand_landmark/palm_detection_detection_to_roi.pbtxt b/mediapipe/modules/hand_landmark/palm_detection_detection_to_roi.pbtxt new file mode 100644 index 0000000000..838633b86c --- /dev/null +++ b/mediapipe/modules/hand_landmark/palm_detection_detection_to_roi.pbtxt @@ -0,0 +1,47 @@ +# MediaPipe subgraph that calculates hand ROI from palm detection. + +type: "PalmDetectionDetectionToRoi" + +# Palm detection. (Detection) +input_stream: "DETECTION:detection" +# Frame size. (std::pair) +input_stream: "IMAGE_SIZE:image_size" + +# ROI (region of interest) according to landmarks, represented as normalized +# rect. (NormalizedRect) +output_stream: "ROI:roi" + +# Converts results of palm detection into a rectangle (normalized by image size) +# that encloses the palm and is rotated such that the line connecting center of +# the wrist and MCP of the middle finger is aligned with the Y-axis of the +# rectangle. +node { + calculator: "DetectionsToRectsCalculator" + input_stream: "DETECTION:detection" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "NORM_RECT:raw_roi" + options: { + [mediapipe.DetectionsToRectsCalculatorOptions.ext] { + rotation_vector_start_keypoint_index: 0 # Center of wrist. + rotation_vector_end_keypoint_index: 2 # MCP of middle finger. + rotation_vector_target_angle_degrees: 90 + } + } +} + +# Expands and shifts the rectangle that contains the palm so that it's likely +# to cover the entire hand. 
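The DetectionsToRectsCalculator above derives the rect rotation from two palm keypoints (index 0, center of wrist, to index 2, MCP of the middle finger) so that this line ends up aligned with the rect's Y axis (target angle 90°). A small sketch of that angle computation; the sign convention follows image coordinates with y pointing down, and the wrap-around normalization is an assumption about the calculator's behavior.

```python
import math

def rotation_from_keypoints(kp_start, kp_end, target_angle_deg=90.0):
    """Rotation aligning the start->end keypoint line with the rect Y axis.

    Keypoints are (x, y) in normalized image coordinates, y pointing down,
    hence the negated dy term.
    """
    (x0, y0), (x1, y1) = kp_start, kp_end
    target = math.radians(target_angle_deg)
    rotation = target - math.atan2(-(y1 - y0), x1 - x0)
    # Wrap to [-pi, pi).
    return rotation - 2 * math.pi * math.floor((rotation + math.pi) /
                                               (2 * math.pi))

# Wrist roughly below the middle-finger MCP -> hand pointing straight up,
# so the required rotation is ~0.
print(rotation_from_keypoints((0.5, 0.8), (0.5, 0.5)))
```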
+node { + calculator: "RectTransformationCalculator" + input_stream: "NORM_RECT:raw_roi" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "roi" + options: { + [mediapipe.RectTransformationCalculatorOptions.ext] { + scale_x: 2.6 + scale_y: 2.6 + shift_y: -0.5 + square_long: true + } + } +} diff --git a/mediapipe/modules/palm_detection/BUILD b/mediapipe/modules/palm_detection/BUILD new file mode 100644 index 0000000000..8b8c1b5d09 --- /dev/null +++ b/mediapipe/modules/palm_detection/BUILD @@ -0,0 +1,61 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load( + "//mediapipe/framework/tool:mediapipe_graph.bzl", + "mediapipe_simple_subgraph", +) + +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +exports_files(["palm_detection.tflite"]) + +mediapipe_simple_subgraph( + name = "palm_detection_cpu", + graph = "palm_detection_cpu.pbtxt", + register_as = "PalmDetectionCpu", + deps = [ + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", + "//mediapipe/calculators/tflite:ssd_anchors_calculator", + "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", + "//mediapipe/calculators/util:detection_letterbox_removal_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "palm_detection_gpu", + graph = "palm_detection_gpu.pbtxt", + register_as = "PalmDetectionGpu", + deps = [ + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", + "//mediapipe/calculators/tflite:ssd_anchors_calculator", + "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", + "//mediapipe/calculators/util:detection_letterbox_removal_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", + ], +) + +exports_files( + srcs = [ + "palm_detection.tflite", + "palm_detection_labelmap.txt", + ], +) diff --git a/mediapipe/modules/palm_detection/README b/mediapipe/modules/palm_detection/README new file mode 100644 index 0000000000..c7fd610604 --- /dev/null +++ b/mediapipe/modules/palm_detection/README @@ -0,0 +1,7 @@ +# palm_detection + +Subgraphs|Details +:--- | :--- +[`PalmDetectionCpu`](https://github.com/google/mediapipe/tree/master/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt)| Detects palms/hands. (CPU input.) +[`PalmDetectionGpu`](https://github.com/google/mediapipe/tree/master/mediapipe/modules/palm_detection/palm_detection_gpu.pbtxt)| Detects palms/hands. (GPU input.) 
+ diff --git a/mediapipe/modules/palm_detection/palm_detection.tflite b/mediapipe/modules/palm_detection/palm_detection.tflite new file mode 100755 index 0000000000..71ce962db5 Binary files /dev/null and b/mediapipe/modules/palm_detection/palm_detection.tflite differ diff --git a/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt b/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt new file mode 100644 index 0000000000..44d026cf4e --- /dev/null +++ b/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt @@ -0,0 +1,134 @@ +# MediaPipe graph to detect palms with TensorFlow Lite on CPU. + +type: "PalmDetectionCpu" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" + +# Detected palms. (std::vector) +# NOTE: there will not be an output packet in the DETECTIONS stream for this +# particular timestamp if none of palms detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +output_stream: "DETECTIONS:detections" + +# Transforms an image into a 128x128 tensor while keeping the aspect ratio, and +# therefore may result in potential letterboxing. +node { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:image" + output_stream: "TENSORS:input_tensor" + output_stream: "LETTERBOX_PADDING:letterbox_padding" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 128 + output_tensor_height: 128 + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } + } + } +} +# Generates a single side packet containing a TensorFlow Lite op resolver that +# supports custom ops needed by the model used in this graph. +node { + calculator: "TfLiteCustomOpResolverCalculator" + output_side_packet: "opresolver" +} + +# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensor" + output_stream: "TENSORS:detection_tensors" + input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + model_path: "mediapipe/modules/palm_detection/palm_detection.tflite" + delegate { xnnpack {} } + } + } +} + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + options: { + [mediapipe.SsdAnchorsCalculatorOptions.ext] { + num_layers: 4 + min_scale: 0.1484375 + max_scale: 0.75 + input_size_height: 128 + input_size_width: 128 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + strides: 8 + strides: 16 + strides: 16 + strides: 16 + aspect_ratios: 1.0 + fixed_anchor_size: true + } + } +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. 
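The PalmDetectionCpu graph feeds the model a 128x128 float tensor in [-1, 1] with aspect-preserving letterboxing (the ImageToTensorCalculator options above). The sketch below reproduces equivalent preprocessing outside MediaPipe and runs the model with the TensorFlow Lite Python interpreter. The input file name is hypothetical, and because the graph registers a custom-op resolver, the stock interpreter may refuse this particular model; the preprocessing math is the point.

```python
import numpy as np
import cv2
import tensorflow as tf

def letterbox_to_tensor(bgr, size=128):
    """Resize with preserved aspect ratio, pad to size x size, map to [-1, 1]."""
    h, w = bgr.shape[:2]
    scale = size / max(h, w)
    new_w, new_h = int(round(w * scale)), int(round(h * scale))
    canvas = np.zeros((size, size, 3), np.uint8)
    top, left = (size - new_h) // 2, (size - new_w) // 2
    canvas[top:top + new_h, left:left + new_w] = cv2.resize(bgr, (new_w, new_h))
    rgb = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB).astype(np.float32)
    tensor = rgb / 127.5 - 1.0
    # Padding fractions, analogous to the LETTERBOX_PADDING stream.
    pad = (left / size, top / size,
           (size - left - new_w) / size, (size - top - new_h) / size)
    return tensor[None, ...], pad

interpreter = tf.lite.Interpreter(
    model_path="mediapipe/modules/palm_detection/palm_detection.tflite")
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]

tensor, pad = letterbox_to_tensor(cv2.imread("hand.jpg"))  # hypothetical image
interpreter.set_tensor(inp["index"], tensor)
interpreter.invoke()
for out in interpreter.get_output_details():
    print(out["name"], interpreter.get_tensor(out["index"]).shape)
```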
+node { + calculator: "TensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:unfiltered_detections" + options: { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { + num_classes: 1 + num_boxes: 896 + num_coords: 18 + box_coord_offset: 0 + keypoint_coord_offset: 4 + num_keypoints: 7 + num_values_per_keypoint: 2 + sigmoid_score: true + score_clipping_thresh: 100.0 + reverse_output_order: true + + x_scale: 128.0 + y_scale: 128.0 + h_scale: 128.0 + w_scale: 128.0 + min_score_thresh: 0.5 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "unfiltered_detections" + output_stream: "filtered_detections" + options: { + [mediapipe.NonMaxSuppressionCalculatorOptions.ext] { + min_suppression_threshold: 0.3 + overlap_type: INTERSECTION_OVER_UNION + algorithm: WEIGHTED + } + } +} + +# Adjusts detection locations (already normalized to [0.f, 1.f]) on the +# letterboxed image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (the +# input image to the graph before image transformation). +node { + calculator: "DetectionLetterboxRemovalCalculator" + input_stream: "DETECTIONS:filtered_detections" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "DETECTIONS:detections" +} diff --git a/mediapipe/modules/palm_detection/palm_detection_gpu.pbtxt b/mediapipe/modules/palm_detection/palm_detection_gpu.pbtxt new file mode 100644 index 0000000000..a6c773f7f9 --- /dev/null +++ b/mediapipe/modules/palm_detection/palm_detection_gpu.pbtxt @@ -0,0 +1,140 @@ +# MediaPipe graph to detect palms with TensorFlow Lite on GPU. + +type: "PalmDetectionGpu" + +# GPU image. (GpuBuffer) +input_stream: "IMAGE:image" + +# Detected palms. (std::vector) +# NOTE: there will not be an output packet in the DETECTIONS stream for this +# particular timestamp if none of palms detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +output_stream: "DETECTIONS:detections" + +# Transforms an image into a 256x256 tensor while keeping the aspect ratio, and +# therefore may result in potential letterboxing. +node { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE_GPU:image" + output_stream: "TENSORS:input_tensor" + output_stream: "LETTERBOX_PADDING:letterbox_padding" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 128 + output_tensor_height: 128 + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } + gpu_origin: TOP_LEFT + } + } +} +# Generates a single side packet containing a TensorFlow Lite op resolver that +# supports custom ops needed by the model used in this graph. +node { + calculator: "TfLiteCustomOpResolverCalculator" + output_side_packet: "opresolver" + options: { + [mediapipe.TfLiteCustomOpResolverCalculatorOptions.ext] { + use_gpu: true + } + } +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. 
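TensorsToDetectionsCalculator above decodes 896 anchors with 18 coordinates each (a box plus 7 keypoints x 2) and sigmoid scores clipped at +/-100, keeping detections above a 0.5 score. The NumPy sketch below shows a representative SSD-style decode under those options; the exact field order (reverse_output_order) and anchor convention (fixed_anchor_size) are assumptions, so treat the formulas as illustrative rather than the calculator's source.

```python
import numpy as np

def decode_palms(raw_boxes, raw_scores, anchors,
                 scale=128.0, score_clip=100.0, min_score=0.5):
    """raw_boxes: (896, 18), raw_scores: (896, 1), anchors: (896, 4) as
    (x_center, y_center, w, h); with fixed anchors, w = h = 1."""
    scores = 1.0 / (1.0 + np.exp(-np.clip(raw_scores[:, 0],
                                          -score_clip, score_clip)))
    # Assumed layout after reverse_output_order: x, y, w, h come first.
    cx = raw_boxes[:, 0] / scale * anchors[:, 2] + anchors[:, 0]
    cy = raw_boxes[:, 1] / scale * anchors[:, 3] + anchors[:, 1]
    w = raw_boxes[:, 2] / scale * anchors[:, 2]
    h = raw_boxes[:, 3] / scale * anchors[:, 3]
    # Seven (x, y) keypoints follow the box, offset by the anchor center.
    kps = raw_boxes[:, 4:18].reshape(-1, 7, 2) / scale + anchors[:, None, :2]

    keep = scores >= min_score
    boxes = np.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], axis=-1)
    return boxes[keep], kps[keep], scores[keep]
```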
+node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensor" + output_stream: "TENSORS:detection_tensors" + input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + model_path: "mediapipe/modules/palm_detection/palm_detection.tflite" + use_gpu: true + } + } +} + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + options: { + [mediapipe.SsdAnchorsCalculatorOptions.ext] { + num_layers: 4 + min_scale: 0.1484375 + max_scale: 0.75 + input_size_height: 128 + input_size_width: 128 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + strides: 8 + strides: 16 + strides: 16 + strides: 16 + aspect_ratios: 1.0 + fixed_anchor_size: true + } + } +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. +node { + calculator: "TensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:unfiltered_detections" + options: { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { + num_classes: 1 + num_boxes: 896 + num_coords: 18 + box_coord_offset: 0 + keypoint_coord_offset: 4 + num_keypoints: 7 + num_values_per_keypoint: 2 + sigmoid_score: true + score_clipping_thresh: 100.0 + reverse_output_order: true + + x_scale: 128.0 + y_scale: 128.0 + h_scale: 128.0 + w_scale: 128.0 + min_score_thresh: 0.5 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "unfiltered_detections" + output_stream: "filtered_detections" + options: { + [mediapipe.NonMaxSuppressionCalculatorOptions.ext] { + min_suppression_threshold: 0.3 + overlap_type: INTERSECTION_OVER_UNION + algorithm: WEIGHTED + } + } +} + +# Adjusts detection locations (already normalized to [0.f, 1.f]) on the +# letterboxed image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (the +# input image to the graph before image transformation). 
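The NonMaxSuppressionCalculator above uses intersection-over-union overlap with the WEIGHTED algorithm, which blends overlapping candidates instead of simply discarding them. A hedged NumPy sketch of that idea; the calculator's exact tie-breaking and keypoint handling may differ.

```python
import numpy as np

def iou(box, boxes):
    x1 = np.maximum(box[0], boxes[:, 0]); y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[2], boxes[:, 2]); y2 = np.minimum(box[3], boxes[:, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area = lambda b: (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])
    return inter / (area(box) + area(boxes) - inter + 1e-9)

def weighted_nms(boxes, scores, threshold=0.3):
    """Score-weighted NMS: each kept detection is the score-weighted average
    of the candidates it suppresses."""
    order = np.argsort(-scores)
    boxes, scores = boxes[order], scores[order]
    out_boxes, out_scores = [], []
    used = np.zeros(len(boxes), bool)
    for i in range(len(boxes)):
        if used[i]:
            continue
        group = (iou(boxes[i], boxes) > threshold) & ~used
        used |= group
        w = scores[group]
        out_boxes.append((boxes[group] * w[:, None]).sum(0) / w.sum())
        out_scores.append(scores[i])
    return np.array(out_boxes), np.array(out_scores)
```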
+node { + calculator: "DetectionLetterboxRemovalCalculator" + input_stream: "DETECTIONS:filtered_detections" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "DETECTIONS:detections" +} diff --git a/mediapipe/modules/pose_detection/BUILD b/mediapipe/modules/pose_detection/BUILD index 60d4f6763e..f4603007e9 100644 --- a/mediapipe/modules/pose_detection/BUILD +++ b/mediapipe/modules/pose_detection/BUILD @@ -26,11 +26,10 @@ mediapipe_simple_subgraph( graph = "pose_detection_cpu.pbtxt", register_as = "PoseDetectionCpu", deps = [ - "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", "//mediapipe/calculators/util:detection_letterbox_removal_calculator", "//mediapipe/calculators/util:non_max_suppression_calculator", ], @@ -41,11 +40,10 @@ mediapipe_simple_subgraph( graph = "pose_detection_gpu.pbtxt", register_as = "PoseDetectionGpu", deps = [ - "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", "//mediapipe/calculators/tflite:ssd_anchors_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", "//mediapipe/calculators/util:detection_letterbox_removal_calculator", "//mediapipe/calculators/util:non_max_suppression_calculator", ], diff --git a/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt b/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt index a0e6a152ca..2c3cdc298c 100644 --- a/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt +++ b/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt @@ -36,40 +36,36 @@ input_stream: "IMAGE:image" # this packet so that they don't wait for it unnecessarily. output_stream: "DETECTIONS:detections" -# Transforms the input image on CPU to a 128x128 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. +# Transforms the input image into a 128x128 while keeping the aspect ratio +# (what is expected by the corresponding model), resulting in potential +# letterboxing in the transformed image. node: { - calculator: "ImageTransformationCalculator" + calculator: "ImageToTensorCalculator" input_stream: "IMAGE:image" - output_stream: "IMAGE:transformed_image" + output_stream: "TENSORS:input_tensors" output_stream: "LETTERBOX_PADDING:letterbox_padding" options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 128 - output_height: 128 - scale_mode: FIT + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 128 + output_tensor_height: 128 + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } } } } -# Converts the transformed input image on CPU into an image tensor stored as a -# TfLiteTensor. 
-node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_image" - output_stream: "TENSORS:input_tensors" -} - # Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a # vector of tensors representing, for instance, detection boxes/keypoints and # scores. node { - calculator: "TfLiteInferenceCalculator" + calculator: "InferenceCalculator" input_stream: "TENSORS:input_tensors" output_stream: "TENSORS:detection_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/pose_detection/pose_detection.tflite" delegate { xnnpack {} } } @@ -104,12 +100,12 @@ node { # the SSD anchors and the specification in the options, into a vector of # detections. Each detection describes a detected object. node { - calculator: "TfLiteTensorsToDetectionsCalculator" + calculator: "TensorsToDetectionsCalculator" input_stream: "TENSORS:detection_tensors" input_side_packet: "ANCHORS:anchors" output_stream: "DETECTIONS:unfiltered_detections" options: { - [mediapipe.TfLiteTensorsToDetectionsCalculatorOptions.ext] { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { num_classes: 1 num_boxes: 896 num_coords: 12 diff --git a/mediapipe/modules/pose_detection/pose_detection_gpu.pbtxt b/mediapipe/modules/pose_detection/pose_detection_gpu.pbtxt index b75397bc29..fb7fe61170 100644 --- a/mediapipe/modules/pose_detection/pose_detection_gpu.pbtxt +++ b/mediapipe/modules/pose_detection/pose_detection_gpu.pbtxt @@ -36,41 +36,37 @@ input_stream: "IMAGE:image" # this packet so that they don't wait for it unnecessarily. output_stream: "DETECTIONS:detections" -# Transforms the input image on GPU to a 128x128 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. +# Transforms the input image into a 128x128 while keeping the aspect ratio +# (what is expected by the corresponding model), resulting in potential +# letterboxing in the transformed image. node: { - calculator: "ImageTransformationCalculator" + calculator: "ImageToTensorCalculator" input_stream: "IMAGE_GPU:image" - output_stream: "IMAGE_GPU:transformed_image" + output_stream: "TENSORS:input_tensors" output_stream: "LETTERBOX_PADDING:letterbox_padding" options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 128 - output_height: 128 - scale_mode: FIT + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 128 + output_tensor_height: 128 + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } + gpu_origin: TOP_LEFT } } } -# Converts the transformed input image on GPU into an image tensor stored as a -# TfLiteTensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_image" - output_stream: "TENSORS_GPU:input_tensors" -} - -# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a # vector of tensors representing, for instance, detection boxes/keypoints and # scores. 
node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:input_tensors" - # TODO: we can use TENSORS_GPU here and in the downstream calculator + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" output_stream: "TENSORS:detection_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/pose_detection/pose_detection.tflite" } } @@ -104,12 +100,12 @@ node { # the SSD anchors and the specification in the options, into a vector of # detections. Each detection describes a detected object. node { - calculator: "TfLiteTensorsToDetectionsCalculator" + calculator: "TensorsToDetectionsCalculator" input_stream: "TENSORS:detection_tensors" input_side_packet: "ANCHORS:anchors" output_stream: "DETECTIONS:unfiltered_detections" options: { - [mediapipe.TfLiteTensorsToDetectionsCalculatorOptions.ext] { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { num_classes: 1 num_boxes: 896 num_coords: 12 diff --git a/mediapipe/modules/pose_landmark/BUILD b/mediapipe/modules/pose_landmark/BUILD index 70de124aba..32cb33ba6e 100644 --- a/mediapipe/modules/pose_landmark/BUILD +++ b/mediapipe/modules/pose_landmark/BUILD @@ -27,13 +27,12 @@ mediapipe_simple_subgraph( register_as = "PoseLandmarkUpperBodyByRoiGpu", deps = [ "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:split_normalized_landmark_list_calculator", "//mediapipe/calculators/core:split_vector_calculator", - "//mediapipe/calculators/image:image_cropping_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_landmarks_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", "//mediapipe/calculators/util:landmark_projection_calculator", "//mediapipe/calculators/util:thresholding_calculator", @@ -46,13 +45,12 @@ mediapipe_simple_subgraph( register_as = "PoseLandmarkUpperBodyByRoiCpu", deps = [ "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:split_normalized_landmark_list_calculator", "//mediapipe/calculators/core:split_vector_calculator", - "//mediapipe/calculators/image:image_cropping_calculator", - "//mediapipe/calculators/image:image_transformation_calculator", - "//mediapipe/calculators/tflite:tflite_converter_calculator", - "//mediapipe/calculators/tflite:tflite_inference_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator", - "//mediapipe/calculators/tflite:tflite_tensors_to_landmarks_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", "//mediapipe/calculators/util:landmark_projection_calculator", "//mediapipe/calculators/util:thresholding_calculator", @@ -73,6 +71,7 @@ mediapipe_simple_subgraph( 
"//mediapipe/calculators/core:previous_loopback_calculator", "//mediapipe/calculators/core:split_vector_calculator", "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:logic_calculator", "//mediapipe/modules/pose_detection:pose_detection_gpu", ], ) @@ -91,10 +90,33 @@ mediapipe_simple_subgraph( "//mediapipe/calculators/core:previous_loopback_calculator", "//mediapipe/calculators/core:split_vector_calculator", "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:logic_calculator", "//mediapipe/modules/pose_detection:pose_detection_cpu", ], ) +mediapipe_simple_subgraph( + name = "pose_landmark_upper_body_smoothed_cpu", + graph = "pose_landmark_upper_body_smoothed_cpu.pbtxt", + register_as = "PoseLandmarkUpperBodySmoothedCpu", + deps = [ + ":pose_landmark_upper_body_cpu", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:landmarks_smoothing_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "pose_landmark_upper_body_smoothed_gpu", + graph = "pose_landmark_upper_body_smoothed_gpu.pbtxt", + register_as = "PoseLandmarkUpperBodySmoothedGpu", + deps = [ + ":pose_landmark_upper_body_gpu", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:landmarks_smoothing_calculator", + ], +) + exports_files( srcs = [ "pose_landmark_upper_body.tflite", @@ -116,7 +138,6 @@ mediapipe_simple_subgraph( graph = "pose_landmark_upper_body_landmarks_to_roi.pbtxt", register_as = "PoseLandmarkUpperBodyLandmarksToRoi", deps = [ - "//mediapipe/calculators/core:split_vector_calculator", "//mediapipe/calculators/util:alignment_points_to_rects_calculator", "//mediapipe/calculators/util:landmarks_to_detection_calculator", "//mediapipe/calculators/util:rect_transformation_calculator", diff --git a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_cpu.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_cpu.pbtxt index 6a557ae208..df0c7402c4 100644 --- a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_cpu.pbtxt +++ b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_cpu.pbtxt @@ -22,9 +22,7 @@ input_stream: "IMAGE:image" input_stream: "ROI:roi" # Pose landmarks within the given ROI. (NormalizedLandmarkList) -# We have 25 (upper-body) landmarks -# (see pose_landmark_upper_body_topology.svg), and there are other auxiliary key -# points. +# We have 25 (upper-body) landmarks (see pose_landmark_upper_body_topology.svg). # 0 - nose # 1 - right eye (inner) # 2 - right eye @@ -56,58 +54,39 @@ input_stream: "ROI:roi" # the MediaPipe framework will internally inform the downstream calculators of # the absence of this packet so that they don't wait for it unnecessarily. output_stream: "LANDMARKS:landmarks" +# Auxiliary landmarks for deriving the ROI in the subsequent image. +# (NormalizedLandmarkList) +output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks" -# Crops the rectangle that contains a pose from the input image. -node { - calculator: "ImageCroppingCalculator" +# Transforms the input image into a 256x256 tensor while keeping the aspect +# ratio (what is expected by the corresponding model), resulting in potential +# letterboxing in the transformed image. 
+node: { + calculator: "ImageToTensorCalculator" input_stream: "IMAGE:image" input_stream: "NORM_RECT:roi" - output_stream: "IMAGE:pose_region" - options: { - [mediapipe.ImageCroppingCalculatorOptions.ext] { - border_mode: BORDER_REPLICATE - output_max_width: 256 - output_max_height: 256 - } - } -} - -# Transforms the input image on CPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE:pose_region" - output_stream: "IMAGE:transformed_pose_region" - output_stream: "LETTERBOX_PADDING:letterbox_padding" - options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Converts the transformed input image on CPU into a tensor. -node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE:transformed_pose_region" output_stream: "TENSORS:input_tensors" + output_stream: "LETTERBOX_PADDING:letterbox_padding" options: { - [mediapipe.TfLiteConverterCalculatorOptions.ext] { - zero_center: false + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 256 + output_tensor_height: 256 + keep_aspect_ratio: true + output_tensor_float_range { + min: 0.0 + max: 1.0 + } } } } # Runs a TensorFlow Lite model inference on CPU. node { - calculator: "TfLiteInferenceCalculator" + calculator: "InferenceCalculator" input_stream: "TENSORS:input_tensors" output_stream: "TENSORS:output_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/pose_landmark/pose_landmark_upper_body.tflite" delegate { xnnpack {} } } @@ -117,7 +96,7 @@ node { # Splits a vector of TFLite tensors to multiple vectors according to the ranges # specified in option. node { - calculator: "SplitTfLiteTensorVectorCalculator" + calculator: "SplitTensorVectorCalculator" input_stream: "output_tensors" output_stream: "landmark_tensors" output_stream: "pose_flag_tensor" @@ -132,7 +111,7 @@ node { # Converts the pose-flag tensor into a float that represents the confidence # score of pose presence. node { - calculator: "TfLiteTensorsToFloatsCalculator" + calculator: "TensorsToFloatsCalculator" input_stream: "TENSORS:pose_flag_tensor" output_stream: "FLOAT:pose_presence_score" } @@ -150,7 +129,7 @@ node { } } -# Drop landmarks tensors if pose is not present. +# Drops landmark tensors if pose is not present. node { calculator: "GateCalculator" input_stream: "landmark_tensors" @@ -158,15 +137,15 @@ node { output_stream: "ensured_landmark_tensors" } -# Decodes the landmark tensors into a vector of lanmarks, where the landmark +# Decodes the landmark tensors into a vector of landmarks, where the landmark # coordinates are normalized by the size of the input image to the model. 
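In the by-ROI pose graphs, the pose-flag tensor is converted to a float score, thresholded, and used to gate the landmark tensors so nothing is decoded when no pose is present. A compact sketch of that gate; the presence threshold and the per-landmark field layout are assumptions (they come from calculator options and the model, not shown in full here).

```python
import numpy as np

def decode_pose_if_present(landmark_tensor, pose_flag_tensor,
                           presence_threshold=0.5, input_size=256):
    """Return None when the pose-presence score is below the threshold
    (mirroring ThresholdingCalculator + GateCalculator), otherwise decode
    27 landmark rows normalized by the 256x256 model input size."""
    score = float(np.asarray(pose_flag_tensor).reshape(-1)[0])
    if score < presence_threshold:
        return None                     # landmark tensors are dropped
    coords = np.asarray(landmark_tensor).reshape(27, -1)[:, :3] / input_size
    return score, coords
```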
node { - calculator: "TfLiteTensorsToLandmarksCalculator" + calculator: "TensorsToLandmarksCalculator" input_stream: "TENSORS:ensured_landmark_tensors" output_stream: "NORM_LANDMARKS:raw_landmarks" options: { - [mediapipe.TfLiteTensorsToLandmarksCalculatorOptions.ext] { - num_landmarks: 31 + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 27 input_image_width: 256 input_image_height: 256 } @@ -190,5 +169,20 @@ node { calculator: "LandmarkProjectionCalculator" input_stream: "NORM_LANDMARKS:adjusted_landmarks" input_stream: "NORM_RECT:roi" - output_stream: "NORM_LANDMARKS:landmarks" + output_stream: "NORM_LANDMARKS:all_landmarks" +} + +# Splits the landmarks into two sets: the actual pose landmarks and the +# auxiliary landmarks. +node { + calculator: "SplitNormalizedLandmarkListCalculator" + input_stream: "all_landmarks" + output_stream: "landmarks" + output_stream: "auxiliary_landmarks" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 25 } + ranges: { begin: 25 end: 27 } + } + } } diff --git a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_gpu.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_gpu.pbtxt index aadbd18886..5fa116318e 100644 --- a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_gpu.pbtxt +++ b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_by_roi_gpu.pbtxt @@ -22,9 +22,7 @@ input_stream: "IMAGE:image" input_stream: "ROI:roi" # Pose landmarks within the given ROI. (NormalizedLandmarkList) -# We have 25 (upper-body) landmarks -# (see pose_landmark_upper_body_topology.svg), and there are other auxiliary key -# points. +# We have 25 (upper-body) landmarks (see pose_landmark_upper_body_topology.svg). # 0 - nose # 1 - right eye (inner) # 2 - right eye @@ -56,58 +54,40 @@ input_stream: "ROI:roi" # the MediaPipe framework will internally inform the downstream calculators of # the absence of this packet so that they don't wait for it unnecessarily. output_stream: "LANDMARKS:landmarks" +# Auxiliary landmarks for deriving the ROI in the subsequent image. +# (NormalizedLandmarkList) +output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks" -# Crops the rectangle that contains a pose from the input image. -node { - calculator: "ImageCroppingCalculator" +# Transforms the input image into a 256x256 tensor while keeping the aspect +# ratio (what is expected by the corresponding model), resulting in potential +# letterboxing in the transformed image. +node: { + calculator: "ImageToTensorCalculator" input_stream: "IMAGE_GPU:image" input_stream: "NORM_RECT:roi" - output_stream: "IMAGE_GPU:pose_region" - options: { - [mediapipe.ImageCroppingCalculatorOptions.ext] { - border_mode: BORDER_REPLICATE - output_max_width: 256 - output_max_height: 256 - } - } -} - -# Transforms the input image on GPU to a 256x256 image. To scale the input -# image, the scale_mode option is set to FIT to preserve the aspect ratio, -# resulting in potential letterboxing in the transformed image. -node: { - calculator: "ImageTransformationCalculator" - input_stream: "IMAGE_GPU:pose_region" - output_stream: "IMAGE_GPU:transformed_pose_region" + output_stream: "TENSORS:input_tensors" output_stream: "LETTERBOX_PADDING:letterbox_padding" options: { - [mediapipe.ImageTransformationCalculatorOptions.ext] { - output_width: 256 - output_height: 256 - scale_mode: FIT - } - } -} - -# Converts the transformed input image on GPU into a tensor. 
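The model emits 27 landmarks, and the SplitNormalizedLandmarkListCalculator above separates the 25 reported upper-body landmarks from the 2 auxiliary alignment landmarks that only feed the next frame's ROI. In plain Python terms, for any list-like landmark container:

```python
def split_landmarks(all_landmarks):
    """Mirror of the split ranges above: [0, 25) are the reported upper-body
    landmarks, [25, 27) are the auxiliary alignment landmarks."""
    assert len(all_landmarks) == 27
    return all_landmarks[0:25], all_landmarks[25:27]

landmarks, auxiliary = split_landmarks(list(range(27)))
print(len(landmarks), len(auxiliary))   # 25 2
```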
-node { - calculator: "TfLiteConverterCalculator" - input_stream: "IMAGE_GPU:transformed_pose_region" - output_stream: "TENSORS_GPU:input_tensors" - options: { - [mediapipe.TfLiteConverterCalculatorOptions.ext] { - zero_center: false + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 256 + output_tensor_height: 256 + keep_aspect_ratio: true + output_tensor_float_range { + min: 0.0 + max: 1.0 + } + gpu_origin: TOP_LEFT } } } # Runs a TensorFlow Lite model inference on GPU. node { - calculator: "TfLiteInferenceCalculator" - input_stream: "TENSORS_GPU:input_tensors" + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" output_stream: "TENSORS:output_tensors" options: { - [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + [mediapipe.InferenceCalculatorOptions.ext] { model_path: "mediapipe/modules/pose_landmark/pose_landmark_upper_body.tflite" } } @@ -116,7 +96,7 @@ node { # Splits a vector of TFLite tensors to multiple vectors according to the ranges # specified in option. node { - calculator: "SplitTfLiteTensorVectorCalculator" + calculator: "SplitTensorVectorCalculator" input_stream: "output_tensors" output_stream: "landmark_tensors" output_stream: "pose_flag_tensor" @@ -131,7 +111,7 @@ node { # Converts the pose-flag tensor into a float that represents the confidence # score of pose presence. node { - calculator: "TfLiteTensorsToFloatsCalculator" + calculator: "TensorsToFloatsCalculator" input_stream: "TENSORS:pose_flag_tensor" output_stream: "FLOAT:pose_presence_score" } @@ -149,7 +129,7 @@ node { } } -# Drop landmarks tensors if pose is not present. +# Drops landmark tensors if pose is not present. node { calculator: "GateCalculator" input_stream: "landmark_tensors" @@ -157,15 +137,15 @@ node { output_stream: "ensured_landmark_tensors" } -# Decodes the landmark tensors into a vector of lanmarks, where the landmark +# Decodes the landmark tensors into a vector of landmarks, where the landmark # coordinates are normalized by the size of the input image to the model. node { - calculator: "TfLiteTensorsToLandmarksCalculator" + calculator: "TensorsToLandmarksCalculator" input_stream: "TENSORS:ensured_landmark_tensors" output_stream: "NORM_LANDMARKS:raw_landmarks" options: { - [mediapipe.TfLiteTensorsToLandmarksCalculatorOptions.ext] { - num_landmarks: 31 + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 27 input_image_width: 256 input_image_height: 256 } @@ -189,5 +169,20 @@ node { calculator: "LandmarkProjectionCalculator" input_stream: "NORM_LANDMARKS:adjusted_landmarks" input_stream: "NORM_RECT:roi" - output_stream: "NORM_LANDMARKS:landmarks" + output_stream: "NORM_LANDMARKS:all_landmarks" +} + +# Splits the landmarks into two sets: the actual pose landmarks and the +# auxiliary landmarks. +node { + calculator: "SplitNormalizedLandmarkListCalculator" + input_stream: "all_landmarks" + output_stream: "landmarks" + output_stream: "auxiliary_landmarks" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 25 } + ranges: { begin: 25 end: 27 } + } + } } diff --git a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_cpu.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_cpu.pbtxt index 2193e33175..bb7ed52880 100644 --- a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_cpu.pbtxt +++ b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_cpu.pbtxt @@ -22,10 +22,12 @@ type: "PoseLandmarkUpperBodyCpu" # CPU image. 
(ImageFrame) input_stream: "IMAGE:image" +# Whether pose detection can be skipped when pose regions can already be +# approximated from pose landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + # Pose landmarks within the given ROI. (NormalizedLandmarkList) -# We have 25 (upper-body) landmarks -# (see pose_landmark_upper_body_topology.svg), and there are other auxiliary key -# points. +# We have 25 (upper-body) landmarks (see pose_landmark_upper_body_topology.svg). # 0 - nose # 1 - right eye (inner) # 2 - right eye @@ -97,13 +99,23 @@ node { } # Drops the incoming image if PoseLandmarkUpperBodyByRoiCpu was able to identify -# pose presence in the previous image. Otherwise, passes the incoming image -# through to trigger a new round of pose detection in PoseDetectionCpu. +# pose presence in the previous image and skipping pose detection is enabled. +# Otherwise, passes the incoming image through to trigger a new round of pose +# detection in PoseDetectionCpu. +node { + calculator: "LogicCalculator" + options: { + [mediapipe.LogicCalculatorOptions.ext] { op: AND } + } + input_side_packet: "can_skip_detection" + input_stream: "prev_pose_rect_from_landmarks_is_present" + output_stream: "skip_detection" +} node { calculator: "GateCalculator" input_stream: "image" input_stream: "image_size" - input_stream: "DISALLOW:prev_pose_rect_from_landmarks_is_present" + input_stream: "DISALLOW:skip_detection" output_stream: "image_for_pose_detection" output_stream: "image_size_for_pose_detection" options: { @@ -158,13 +170,14 @@ node { input_stream: "IMAGE:image" input_stream: "ROI:pose_rect" output_stream: "LANDMARKS:pose_landmarks" + output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks" } -# Calculates region of interest based on pose landmarks, so that can be reused -# for subsequent image. +# Calculates region of interest based on the auxiliary landmarks, to be used in +# the subsequent image. node { calculator: "PoseLandmarkUpperBodyLandmarksToRoi" - input_stream: "LANDMARKS:pose_landmarks" + input_stream: "LANDMARKS:auxiliary_landmarks" input_stream: "IMAGE_SIZE:image_size" output_stream: "ROI:pose_rect_from_landmarks" } diff --git a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_gpu.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_gpu.pbtxt index 5666b2d9e3..8369d26151 100644 --- a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_gpu.pbtxt +++ b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_gpu.pbtxt @@ -22,10 +22,12 @@ type: "PoseLandmarkUpperBodyGpu" # GPU image. (GpuBuffer) input_stream: "IMAGE:image" +# Whether pose detection can be skipped when pose regions can already be +# approximated from pose landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + # Pose landmarks within the given ROI. (NormalizedLandmarkList) -# We have 25 (upper-body) landmarks -# (see pose_landmark_upper_body_topology.svg), and there are other auxiliary key -# points. +# We have 25 (upper-body) landmarks (see pose_landmark_upper_body_topology.svg). # 0 - nose # 1 - right eye (inner) # 2 - right eye @@ -97,13 +99,23 @@ node { } # Drops the incoming image if PoseLandmarkUpperBodyByRoiGpu was able to identify -# pose presence in the previous image. Otherwise, passes the incoming image -# through to trigger a new round of pose detection in PoseDetectionGpu. +# pose presence in the previous image and skipping pose detection is enabled. 
+# Otherwise, passes the incoming image through to trigger a new round of pose +# detection in PoseDetectionGpu. +node { + calculator: "LogicCalculator" + options: { + [mediapipe.LogicCalculatorOptions.ext] { op: AND } + } + input_side_packet: "can_skip_detection" + input_stream: "prev_pose_rect_from_landmarks_is_present" + output_stream: "skip_detection" +} node { calculator: "GateCalculator" input_stream: "image" input_stream: "image_size" - input_stream: "DISALLOW:prev_pose_rect_from_landmarks_is_present" + input_stream: "DISALLOW:skip_detection" output_stream: "image_for_pose_detection" output_stream: "image_size_for_pose_detection" options: { @@ -158,13 +170,14 @@ node { input_stream: "IMAGE:image" input_stream: "ROI:pose_rect" output_stream: "LANDMARKS:pose_landmarks" + output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks" } -# Calculates region of interest based on pose landmarks, so that can be reused -# for subsequent image. +# Calculates region of interest based on the auxiliary landmarks, to be used in +# the subsequent image. node { calculator: "PoseLandmarkUpperBodyLandmarksToRoi" - input_stream: "LANDMARKS:pose_landmarks" + input_stream: "LANDMARKS:auxiliary_landmarks" input_stream: "IMAGE_SIZE:image_size" output_stream: "ROI:pose_rect_from_landmarks" } diff --git a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_landmarks_to_roi.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_landmarks_to_roi.pbtxt index 987de9958b..76854d575b 100644 --- a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_landmarks_to_roi.pbtxt +++ b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_landmarks_to_roi.pbtxt @@ -13,21 +13,10 @@ input_stream: "IMAGE_SIZE:image_size" # ROI according to landmarks. (NormalizedRect) output_stream: "ROI:roi" -node { - calculator: "SplitNormalizedLandmarkListCalculator" - input_stream: "landmarks" - output_stream: "alignment_landmarks" - options: { - [mediapipe.SplitVectorCalculatorOptions.ext] { - ranges: { begin: 25 end: 27 } - } - } -} - # Converts landmarks to a detection that tightly encloses all landmarks. node { calculator: "LandmarksToDetectionCalculator" - input_stream: "NORM_LANDMARKS:alignment_landmarks" + input_stream: "NORM_LANDMARKS:landmarks" output_stream: "DETECTION:detection" } diff --git a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_smoothed_cpu.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_smoothed_cpu.pbtxt new file mode 100644 index 0000000000..466d320439 --- /dev/null +++ b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_smoothed_cpu.pbtxt @@ -0,0 +1,108 @@ +# Experimental: Adds additional temporal filtering of the landmarks as a post +# processing step to reduce jitter. +# +# MediaPipe graph to detect/predict pose landmarks. (CPU input, and inference is +# executed on CPU.) This graph tries to skip pose detection as much as possible +# by using previously detected/predicted landmarks for new images. +# +# It is required that "pose_detection.tflite" is available at +# "mediapipe/modules/pose_detection/pose_detection.tflite" +# path during execution. +# +# It is required that "pose_landmark_upper_body.tflite" is available at +# "mediapipe/modules/pose_landmark/pose_landmark_upper_body.tflite" +# path during execution. +# +# EXAMPLE: +# node { +# calculator: "PoseLandmarkUpperBodySmoothedCpu" +# input_stream: "IMAGE:image" +# output_stream: "LANDMARKS:pose_landmarks" +# } + +type: "PoseLandmarkUpperBodySmoothedCpu" + +# CPU image. 
(ImageFrame) +input_stream: "IMAGE:image" + +# Whether pose detection can be skipped when pose regions can already be +# approximated from pose landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + +# The pose landmarks within the given ROI. (NormalizedLandmarkList) +# We have 25 (upper-body) landmarks +# (see pose_landmark_upper_body_topology.svg), and there are other auxiliary key +# points. +# 0 - nose +# 1 - right eye (inner) +# 2 - right eye +# 3 - right eye (outer) +# 4 - left eye (inner) +# 5 - left eye +# 6 - left eye (outer) +# 7 - right ear +# 8 - left ear +# 9 - mouth (right) +# 10 - mouth (left) +# 11 - right shoulder +# 12 - left shoulder +# 13 - right elbow +# 14 - left elbow +# 15 - right wrist +# 16 - left wrist +# 17 - right pinky +# 18 - left pinky +# 19 - right index +# 20 - left index +# 21 - right thumb +# 22 - left thumb +# 23 - right hip +# 24 - left hip +# +# NOTE: if a pose is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:pose_landmarks" + +# Extra outputs (for debugging, for instance). +# Detected poses. (Detection) +output_stream: "DETECTION:pose_detection" +# Regions of interest calculated based on landmarks. (NormalizedRect) +output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks" +# Regions of interest calculated based on pose detections. (NormalizedRect) +output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection" + +# Subgraph that detects poses and corresponding landmarks. +node { + calculator: "PoseLandmarkUpperBodyCpu" + input_stream: "IMAGE:image" + output_stream: "LANDMARKS:unsmoothed_pose_landmarks" + output_stream: "DETECTION:pose_detection" + output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks" + output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection" + input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" +} + +# Calculates size of the image. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE:image" + output_stream: "SIZE:image_size" +} + +# Smoothes pose landmarks in order to reduce jitter. +node { + calculator: "LandmarksSmoothingCalculator" + input_stream: "NORM_LANDMARKS:unsmoothed_pose_landmarks" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "NORM_FILTERED_LANDMARKS:pose_landmarks" + options: { + [mediapipe.LandmarksSmoothingCalculatorOptions.ext] { + velocity_filter: { + window_size: 5 + velocity_scale: 10.0 + } + } + } +} diff --git a/mediapipe/modules/pose_landmark/pose_landmark_upper_body_smoothed_gpu.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_smoothed_gpu.pbtxt new file mode 100644 index 0000000000..22171912a8 --- /dev/null +++ b/mediapipe/modules/pose_landmark/pose_landmark_upper_body_smoothed_gpu.pbtxt @@ -0,0 +1,109 @@ +# Experimental: Adds additional temporal filtering of the landmarks as a post +# processing step to reduce jitter. +# +# MediaPipe graph to detect/predict pose landmarks. (GPU input, and inference is +# executed on GPU.) This graph tries to skip pose detection as much as possible +# by using previously detected/predicted landmarks for new images. +# +# It is required that "pose_detection.tflite" is available at +# "mediapipe/modules/pose_detection/pose_detection.tflite" +# path during execution. 
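The smoothed variants above post-process the landmark stream with LandmarksSmoothingCalculator configured as a velocity filter (window_size: 5, velocity_scale: 10.0). The sketch below is not that calculator's filter, only the general idea of velocity-scaled smoothing: the faster a value has moved recently, the more the filter trusts the new measurement, trading jitter suppression for low lag.

  from collections import deque


  class VelocitySmoother:
    """Illustrative 1-D velocity-scaled smoother (not the MediaPipe filter)."""

    def __init__(self, window_size=5, velocity_scale=10.0):
      self._velocities = deque(maxlen=window_size)
      self._velocity_scale = velocity_scale
      self._last = None

    def apply(self, value, dt=1.0 / 30.0):
      """Smooths one scalar sample; dt assumes a 30 fps stream."""
      if self._last is None:
        self._last = value
        return value
      self._velocities.append(abs(value - self._last) / dt)
      mean_velocity = sum(self._velocities) / len(self._velocities)
      # Fast motion -> alpha near 1 (follow the measurement closely); slow
      # motion -> alpha near 0 (heavy smoothing, which suppresses jitter).
      alpha = 1.0 - 1.0 / (1.0 + self._velocity_scale * mean_velocity)
      self._last += alpha * (value - self._last)
      return self._last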
+# +# It is required that "pose_landmark_upper_body.tflite" is available at +# "mediapipe/modules/pose_landmark/pose_landmark_upper_body.tflite" +# path during execution. +# +# EXAMPLE: +# node { +# calculator: "PoseLandmarkUpperBodySmoothedGpu" +# input_stream: "IMAGE:image" +# output_stream: "LANDMARKS:pose_landmarks" +# } + +type: "PoseLandmarkUpperBodySmoothedGpu" + +# GPU image. (GpuBuffer) +input_stream: "IMAGE:image" + +# Whether pose detection can be skipped when pose regions can already be +# approximated from pose landmarks on the previous frame. +input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" + +# Pose landmarks within the given ROI. (NormalizedLandmarkList) +# We have 25 (upper-body) landmarks +# (see pose_landmark_upper_body_topology.svg), and there are other auxiliary key +# points. +# 0 - nose +# 1 - right eye (inner) +# 2 - right eye +# 3 - right eye (outer) +# 4 - left eye (inner) +# 5 - left eye +# 6 - left eye (outer) +# 7 - right ear +# 8 - left ear +# 9 - mouth (right) +# 10 - mouth (left) +# 11 - right shoulder +# 12 - left shoulder +# 13 - right elbow +# 14 - left elbow +# 15 - right wrist +# 16 - left wrist +# 17 - right pinky +# 18 - left pinky +# 19 - right index +# 20 - left index +# 21 - right thumb +# 22 - left thumb +# 23 - right hip +# 24 - left hip +# +# NOTE: if a pose is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:pose_landmarks" + +# Extra outputs (for debugging, for instance). +# Detected poses. (Detection) +output_stream: "DETECTION:pose_detection" +# Regions of interest calculated based on landmarks. (NormalizedRect) +output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks" +# Regions of interest calculated based on pose detections. (NormalizedRect) +output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection" + +# Subgraph that detects poses and corresponding landmarks. +node { + calculator: "PoseLandmarkUpperBodyGpu" + input_stream: "IMAGE:image" + output_stream: "LANDMARKS:unsmoothed_pose_landmarks" + output_stream: "DETECTION:pose_detection" + output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks" + output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection" + input_side_packet: "CAN_SKIP_DETECTION:can_skip_detection" +} + +# Calculates size of the image. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_GPU:image" + output_stream: "SIZE:image_size" +} + +# Smoothes pose landmarks in order to reduce jitter. 
+node { + calculator: "LandmarksSmoothingCalculator" + input_stream: "NORM_LANDMARKS:unsmoothed_pose_landmarks" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "NORM_FILTERED_LANDMARKS:pose_landmarks" + options: { + [mediapipe.LandmarksSmoothingCalculatorOptions.ext] { + velocity_filter: { + window_size: 5 + velocity_scale: 10.0 + } + } + } +} + diff --git a/mediapipe/python/BUILD b/mediapipe/python/BUILD index 1e54d56ee4..b8cdefcebd 100644 --- a/mediapipe/python/BUILD +++ b/mediapipe/python/BUILD @@ -16,14 +16,6 @@ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") licenses(["notice"]) # Apache 2.0 -cc_library( - name = "builtin_calculators", - deps = [ - "//mediapipe/calculators/core:pass_through_calculator", - "//mediapipe/graphs/pose_tracking:upper_body_pose_tracking_cpu_deps", - ], -) - pybind_extension( name = "_framework_bindings", srcs = ["framework_bindings.cc"], @@ -50,5 +42,25 @@ pybind_extension( "//mediapipe/python/pybind:resource_util", "//mediapipe/python/pybind:timestamp", "//mediapipe/python/pybind:validated_graph_config", + # Type registration. + "//mediapipe/framework:basic_types_registration", + "//mediapipe/framework/formats:classification_registration", + "//mediapipe/framework/formats:detection_registration", + "//mediapipe/framework/formats:landmark_registration", + ], +) + +cc_library( + name = "builtin_calculators", + deps = [ + "//mediapipe/calculators/core:pass_through_calculator", + "//mediapipe/calculators/core:split_normalized_landmark_list_calculator", + "//mediapipe/modules/face_detection:face_detection_front_cpu", + "//mediapipe/modules/face_landmark:face_landmark_front_cpu", + "//mediapipe/modules/hand_landmark:hand_landmark_tracking_cpu", + "//mediapipe/modules/palm_detection:palm_detection_cpu", + "//mediapipe/modules/pose_detection:pose_detection_cpu", + "//mediapipe/modules/pose_landmark:pose_landmark_upper_body_by_roi_cpu", + "//mediapipe/modules/pose_landmark:pose_landmark_upper_body_smoothed_cpu", ], ) diff --git a/mediapipe/python/pybind/calculator_graph.cc b/mediapipe/python/pybind/calculator_graph.cc index c977959185..bf86366f05 100644 --- a/mediapipe/python/pybind/calculator_graph.cc +++ b/mediapipe/python/pybind/calculator_graph.cc @@ -31,6 +31,10 @@ namespace mediapipe { namespace python { +// A mutex to guard the output stream observer python callback function. +// Only one python callback can run at once. +absl::Mutex callback_mutex; + template T ParseProto(const py::object& proto_object) { T proto; @@ -393,6 +397,8 @@ void CalculatorGraphSubmodule(pybind11::module* module) { pybind11::function callback_fn) { RaisePyErrorIfNotOk(self->ObserveOutputStream( stream_name, [callback_fn, stream_name](const Packet& packet) { + // Acquire a mutex so that only one callback_fn can run at once. + absl::MutexLock lock(&callback_mutex); callback_fn(stream_name, packet); return mediapipe::OkStatus(); })); diff --git a/mediapipe/python/solution_base.py b/mediapipe/python/solution_base.py new file mode 100644 index 0000000000..2ebdb2cf8d --- /dev/null +++ b/mediapipe/python/solution_base.py @@ -0,0 +1,472 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""MediaPipe SolutionBase module. + +MediaPipe SolutionBase is the common base class for the high-level MediaPipe +Solution APIs such as BlazeFace, hand tracking, and BlazePose. The SolutionBase +class contains the shared logic among the high-level Solution APIs including +graph initialization, processing image/audio data, and graph shutdown. Thus, +users can easily create new MediaPipe Solution APIs on top of the SolutionBase +class. +""" + +import collections +import enum +import os +from typing import Any, Iterable, List, Mapping, NamedTuple, Optional, Union + +import numpy as np + +from google.protobuf import descriptor +# resources dependency +from mediapipe.framework import calculator_pb2 +# pylint: disable=unused-import +from mediapipe.framework.formats import detection_pb2 +from mediapipe.calculators.image import image_transformation_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_detections_calculator_pb2 +from mediapipe.calculators.util import landmarks_smoothing_calculator_pb2 +from mediapipe.calculators.util import logic_calculator_pb2 +from mediapipe.calculators.util import thresholding_calculator_pb2 +from mediapipe.framework.formats import classification_pb2 +from mediapipe.framework.formats import landmark_pb2 +from mediapipe.framework.formats import rect_pb2 +# pylint: enable=unused-import +from mediapipe.python._framework_bindings import calculator_graph +from mediapipe.python._framework_bindings import image_frame +from mediapipe.python._framework_bindings import packet +from mediapipe.python._framework_bindings import resource_util +from mediapipe.python._framework_bindings import validated_graph_config +import mediapipe.python.packet_creator as packet_creator +import mediapipe.python.packet_getter as packet_getter + +RGB_CHANNELS = 3 +# TODO: Enable calculator options modification for more calculators. +CALCULATOR_TO_OPTIONS = { + 'ImageTransformationCalculator': + image_transformation_calculator_pb2 + .ImageTransformationCalculatorOptions, + 'LandmarksSmoothingCalculator': + landmarks_smoothing_calculator_pb2.LandmarksSmoothingCalculatorOptions, + 'LogicCalculator': + logic_calculator_pb2.LogicCalculatorOptions, + 'ThresholdingCalculator': + thresholding_calculator_pb2.ThresholdingCalculatorOptions, + 'TensorsToDetectionsCalculator': + tensors_to_detections_calculator_pb2 + .TensorsToDetectionsCalculatorOptions, +} + + +# TODO: Support more packet data types, such as "Any" type. 
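CALCULATOR_TO_OPTIONS above whitelists which calculator options may be overridden through the calculator_params argument of SolutionBase.__init__ further down, using '{node_name}.{field_name}' keys. A sketch of that mechanism as the face_mesh solution later in this change applies it; the node names and binary graph path are taken from that file, and the graph plus its model files must be available under the installed package:

  import numpy as np

  from mediapipe.python import solution_base

  face_mesh = solution_base.SolutionBase(
      binary_graph_path=(
          'mediapipe/modules/face_landmark/face_landmark_front_cpu.binarypb'),
      side_inputs={'num_faces': 1, 'can_skip_detection': True},
      calculator_params={
          # Looked up via CALCULATOR_TO_OPTIONS['TensorsToDetectionsCalculator'].
          'facedetectionfrontcpu__TensorsToDetectionsCalculator.min_score_thresh':
              0.5,
          # Looked up via CALCULATOR_TO_OPTIONS['ThresholdingCalculator'].
          'facelandmarkcpu__ThresholdingCalculator.threshold': 0.5,
      },
      outputs=['multi_face_landmarks'])
  results = face_mesh.process(np.zeros((480, 640, 3), dtype=np.uint8))
  face_mesh.close()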
+@enum.unique +class _PacketDataType(enum.Enum): + """The packet data types supported by the SolutionBase class.""" + STRING = 'string' + BOOL = 'bool' + INT = 'int' + FLOAT = 'float' + AUDIO = 'matrix' + IMAGE = 'image_frame' + PROTO = 'proto' + PROTO_LIST = 'proto_list' + + @staticmethod + def from_registered_name(registered_name: str) -> '_PacketDataType': + return NAME_TO_TYPE[registered_name] + + +NAME_TO_TYPE: Mapping[str, '_PacketDataType'] = { + 'string': + _PacketDataType.STRING, + 'bool': + _PacketDataType.BOOL, + 'int': + _PacketDataType.INT, + 'float': + _PacketDataType.FLOAT, + '::mediapipe::Matrix': + _PacketDataType.AUDIO, + '::mediapipe::ImageFrame': + _PacketDataType.IMAGE, + '::mediapipe::Classification': + _PacketDataType.PROTO, + '::mediapipe::ClassificationList': + _PacketDataType.PROTO, + '::mediapipe::Detection': + _PacketDataType.PROTO, + '::mediapipe::DetectionList': + _PacketDataType.PROTO, + '::mediapipe::Landmark': + _PacketDataType.PROTO, + '::mediapipe::NormalizedLandmark': + _PacketDataType.PROTO, + '::mediapipe::Rect': + _PacketDataType.PROTO, + '::mediapipe::NormalizedRect': + _PacketDataType.PROTO, + '::mediapipe::NormalizedLandmarkList': + _PacketDataType.PROTO, + '::std::vector<::mediapipe::Classification>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::ClassificationList>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::Detection>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::DetectionList>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::Landmark>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::NormalizedLandmark>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::NormalizedLandmarkList>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::Rect>': + _PacketDataType.PROTO_LIST, + '::std::vector<::mediapipe::NormalizedRect>': + _PacketDataType.PROTO_LIST, +} + + +class SolutionBase: + """The common base class for the high-level MediaPipe Solution APIs. + + The SolutionBase class contains the shared logic among the high-level solution + APIs including graph initialization, processing image/audio data, and graph + shutdown. + + Example usage: + hand_tracker = solution_base.SolutionBase( + binary_graph_path='mediapipe/modules/hand_landmark/hand_landmark_tracking_cpu.binarypb', + side_inputs={'num_hands': 2}) + # Read an image and convert the BGR image to RGB. + input_image = cv2.cvtColor(cv2.imread('/tmp/hand.png'), COLOR_BGR2RGB) + results = hand_tracker.process(input_image) + print(results.palm_detections) + print(results.multi_hand_landmarks) + hand_tracker.close() + """ + + def __init__( + self, + binary_graph_path: Optional[str] = None, + graph_config: Optional[calculator_pb2.CalculatorGraphConfig] = None, + calculator_params: Optional[Mapping[str, Any]] = None, + side_inputs: Optional[Mapping[str, Any]] = None, + outputs: Optional[List[str]] = None): + """Initializes the SolutionBase object. + + Args: + binary_graph_path: The path to a binary mediapipe graph file (.binarypb). + graph_config: A CalculatorGraphConfig proto message or its text proto + format. + calculator_params: A mapping from the + {calculator_name}.{options_field_name} str to the field value. + side_inputs: A mapping from the side packet name to the packet raw data. + outputs: A list of the graph output stream names to observe. If the list + is empty, all the output streams listed in the graph config will be + automatically observed by default. 
+ + Raises: + FileNotFoundError: If the binary graph file can't be found. + RuntimeError: If the underlying calculator graph can't be successfully + initialized or started. + ValueError: If any of the following: + a) If not exactly one of 'binary_graph_path' or 'graph_config' arguments + is provided. + b) If the graph validation process contains error. + c) If the registered type name of the streams and side packets can't be + found. + d) If the calculator options of the calculator listed in + calculator_params is not allowed to be modified. + e) If the calculator options field is a repeated field but the field + value to be set is not iterable. + """ + if bool(binary_graph_path) == bool(graph_config): + raise ValueError( + "Must provide exactly one of 'binary_graph_path' or 'graph_config'.") + # MediaPipe package root path + root_path = os.sep.join( os.path.abspath(__file__).split(os.sep)[:-3]) + resource_util.set_resource_dir(root_path) + validated_graph = validated_graph_config.ValidatedGraphConfig() + if binary_graph_path: + validated_graph.initialize( + binary_graph_path=os.path.join(root_path, binary_graph_path)) + else: + validated_graph.initialize(graph_config=graph_config) + + canonical_graph_config_proto = self._initialize_graph_interface( + validated_graph, side_inputs, outputs) + if calculator_params: + self._modify_calculator_options(canonical_graph_config_proto, + calculator_params) + self._graph = calculator_graph.CalculatorGraph( + graph_config=canonical_graph_config_proto) + self._simulated_timestamp = 0 + self._graph_outputs = {} + + def callback(stream_name: str, output_packet: packet.Packet) -> None: + self._graph_outputs[stream_name] = output_packet + + for stream_name in self._output_stream_type_info.keys(): + self._graph.observe_output_stream(stream_name, callback) + + input_side_packets = { + name: self._make_packet(self._side_input_type_info[name], data) + for name, data in (side_inputs or {}).items() + } + self._graph.start_run(input_side_packets) + + # TODO: Use "inspect.Parameter" to fetch the input argument names and + # types from "_input_stream_type_info" and then auto generate the process + # method signature by "inspect.Signature" in __init__. + def process( + self, input_data: Union[np.ndarray, Mapping[str, + np.ndarray]]) -> NamedTuple: + """Processes a set of RGB image data and output SolutionOutputs. + + Args: + input_data: Either a single numpy ndarray object representing the solo + image input of a graph or a mapping from the stream name to the image + data that represents every input streams of a graph. + + Raises: + NotImplementedError: If input_data contains non image data. + RuntimeError: If the underlying graph occurs any error. + ValueError: If the input image data is not three channel RGB. + + Returns: + A NamedTuple object that contains the output data of a graph run. + The field names in the NamedTuple object are mapping to the graph output + stream names. + + Examples: + solution = solution_base.SolutionBase(graph_config=hand_landmark_graph) + results = solution.process(cv2.imread('/tmp/hand0.png')[:, :, ::-1]) + print(results.detection) + results = solution.process( + {'video_in' : cv2.imread('/tmp/hand1.png')[:, :, ::-1]}) + print(results.hand_landmarks) + """ + self._graph_outputs.clear() + + if isinstance(input_data, np.ndarray): + if len(self._input_stream_type_info.keys()) != 1: + raise ValueError( + "Can't process single image input since the graph has more than one input streams." 
+ ) + input_dict = {next(iter(self._input_stream_type_info)): input_data} + else: + input_dict = input_data + + # Set the timestamp increment to 33333 us to simulate the 30 fps video + # input. + self._simulated_timestamp += 33333 + for stream_name, data in input_dict.items(): + if self._input_stream_type_info[stream_name] == _PacketDataType.IMAGE: + if data.shape[2] != RGB_CHANNELS: + raise ValueError('Input image must contain three channel rgb data.') + self._graph.add_packet_to_input_stream( + stream=stream_name, + packet=self._make_packet(_PacketDataType.IMAGE, + data).at(self._simulated_timestamp)) + else: + # TODO: Support audio data. + raise NotImplementedError( + f'SolutionBase can only process image data. ' + f'{self._input_stream_type_info[stream_name].name} ' + f'type is not supported yet.') + + self._graph.wait_until_idle() + # Create a NamedTuple object where the field names are mapping to the graph + # output stream names. + solution_outputs = collections.namedtuple( + 'SolutionOutputs', self._output_stream_type_info.keys()) + for stream_name in self._output_stream_type_info.keys(): + if stream_name in self._graph_outputs: + setattr( + solution_outputs, stream_name, + self._get_packet_content(self._output_stream_type_info[stream_name], + self._graph_outputs[stream_name])) + else: + setattr(solution_outputs, stream_name, None) + + return solution_outputs + + def close(self) -> None: + """Closes all the input sources and the graph.""" + self._graph.close() + self._graph = None + self._input_stream_type_info = None + self._output_stream_type_info = None + + def _initialize_graph_interface( + self, + validated_graph: validated_graph_config.ValidatedGraphConfig, + side_inputs: Optional[Mapping[str, Any]] = None, + outputs: Optional[List[str]] = None): + """Gets graph interface type information and returns the canonical graph config proto.""" + + canonical_graph_config_proto = calculator_pb2.CalculatorGraphConfig() + canonical_graph_config_proto.ParseFromString(validated_graph.binary_config) + + # Gets name from a 'TAG:index:name' str. + def get_name(tag_index_name): + return tag_index_name.split(':')[-1] + + # Gets the packet type information of the input streams and output streams + # from the validated calculator graph. The mappings from the stream names to + # the packet data types is for deciding which packet creator and getter + # methods to call in the process() method. + def get_stream_packet_type(packet_tag_index_name): + return _PacketDataType.from_registered_name( + validated_graph.registered_stream_type_name( + get_name(packet_tag_index_name))) + + self._input_stream_type_info = { + get_name(tag_index_name): get_stream_packet_type(tag_index_name) + for tag_index_name in canonical_graph_config_proto.input_stream + } + + if not outputs: + output_streams = canonical_graph_config_proto.output_stream + else: + output_streams = outputs + self._output_stream_type_info = { + get_name(tag_index_name): get_stream_packet_type(tag_index_name) + for tag_index_name in output_streams + } + + # Gets the packet type information of the input side packets from the + # validated calculator graph. The mappings from the side packet names to the + # packet data types is for making the input_side_packets dict for graph + # start_run(). 
+ def get_side_packet_type(packet_tag_index_name): + return _PacketDataType.from_registered_name( + validated_graph.registered_side_packet_type_name( + get_name(packet_tag_index_name))) + + self._side_input_type_info = { + get_name(tag_index_name): get_side_packet_type(tag_index_name) + for tag_index_name, _ in (side_inputs or {}).items() + } + return canonical_graph_config_proto + + def _modify_calculator_options( + self, calculator_graph_config: calculator_pb2.CalculatorGraphConfig, + calculator_params: Mapping[str, Any]) -> None: + """Modifies the CalculatorOptions of the calculators listed in calculator_params.""" + + # Reorganizes the calculator options field data by calculator name and puts + # all the field data of the same calculator in a list. + def generate_nested_calculator_params(flat_map): + nested_map = {} + for compound_name, field_value in flat_map.items(): + calculator_and_field_name = compound_name.split('.') + if len(calculator_and_field_name) != 2: + raise ValueError( + f'The key "{compound_name}" in the calculator_params is invalid.') + calculator_name = calculator_and_field_name[0] + field_name = calculator_and_field_name[1] + if calculator_name in nested_map: + nested_map[calculator_name].append((field_name, field_value)) + else: + nested_map[calculator_name] = [(field_name, field_value)] + return nested_map + + def modify_options_fields(calculator_options, options_field_list): + for field_name, field_value in options_field_list: + if field_value is None: + calculator_options.ClearField(field_name) + else: + field_label = calculator_options.DESCRIPTOR.fields_by_name[ + field_name].label + if field_label is descriptor.FieldDescriptor.LABEL_REPEATED: + if not isinstance(field_value, Iterable): + raise ValueError( + f'{field_name} is a repeated proto field but the value ' + f'to be set is {type(field_value)}, which is not iterable.') + # TODO: Support resetting the entire repeated field + # (array-option) and changing the individual values in the repeated + # field (array-element-option). + calculator_options.ClearField(field_name) + for elem in field_value: + getattr(calculator_options, field_name).append(elem) + else: + setattr(calculator_options, field_name, field_value) + + nested_calculator_params = generate_nested_calculator_params( + calculator_params) + + num_modified = 0 + for node in calculator_graph_config.node: + if node.name not in nested_calculator_params: + continue + options_type = CALCULATOR_TO_OPTIONS.get(node.calculator) + if options_type is None: + raise ValueError( + f'Modifying the calculator options of {node.name} is not supported.' + ) + options_field_list = nested_calculator_params[node.name] + if node.HasField('options') and node.node_options: + raise ValueError( + f'Cannot modify the calculator options of {node.name} because it ' + f'has both options and node_options fields.') + if node.node_options: + # The "node_options" case for the proto3 syntax. + node_options_modified = False + for elem in node.node_options: + type_name = elem.type_url.split('/')[-1] + if type_name == options_type.DESCRIPTOR.full_name: + calculator_options = options_type.FromString(elem.value) + modify_options_fields(calculator_options, options_field_list) + elem.value = calculator_options.SerializeToString() + node_options_modified = True + break + # There is no existing node_options being modified. Add a new + # node_options instead. 
+ if not node_options_modified: + calculator_options = options_type() + modify_options_fields(calculator_options, options_field_list) + node.node_options.add().Pack(calculator_options) + else: + # The "options" case for the proto2 syntax as well as the fallback + # when the calculator doesn't have either "options" or "node_options". + modify_options_fields(node.options.Extensions[options_type.ext], + options_field_list) + + num_modified += 1 + # Exits the loop early when every elements in nested_calculator_params + # have been visited. + if num_modified == len(nested_calculator_params): + break + + def _make_packet(self, packet_data_type: _PacketDataType, + data: Any) -> packet.Packet: + if packet_data_type == _PacketDataType.IMAGE: + return packet_creator.create_image_frame( + data, image_format=image_frame.ImageFormat.SRGB) + else: + return getattr(packet_creator, 'create_' + packet_data_type.value)(data) + + def _get_packet_content(self, packet_data_type: _PacketDataType, + output_packet: packet.Packet) -> Any: + if packet_data_type == _PacketDataType.STRING: + return packet_getter.get_str(output_packet) + elif packet_data_type == _PacketDataType.IMAGE: + return packet_getter.get_image_frame(output_packet).numpy_view() + else: + return getattr(packet_getter, 'get_' + packet_data_type.value)( + output_packet) diff --git a/mediapipe/python/solution_base_test.py b/mediapipe/python/solution_base_test.py new file mode 100644 index 0000000000..e3e597c126 --- /dev/null +++ b/mediapipe/python/solution_base_test.py @@ -0,0 +1,288 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Tests for mediapipe.python.solution_base.""" + +from absl.testing import absltest +from absl.testing import parameterized +import numpy as np + +from google.protobuf import text_format +from mediapipe.framework import calculator_pb2 +from mediapipe.framework.formats import detection_pb2 +from mediapipe.python import solution_base + +CALCULATOR_OPTIONS_TEST_GRAPH_CONFIG = """ + input_stream: 'image_in' + output_stream: 'image_out' + node { + name: 'ImageTransformation' + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:image_in' + output_stream: 'IMAGE:image_out' + options: { + [mediapipe.ImageTransformationCalculatorOptions.ext] { + output_width: 10 + output_height: 10 + } + } + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 10 + output_height: 10 + } + } + } +""" + + +class SolutionBaseTest(parameterized.TestCase): + + def test_invalid_initialization_arguments(self): + with self.assertRaisesRegex( + ValueError, + 'Must provide exactly one of \'binary_graph_path\' or \'graph_config\'.' + ): + solution_base.SolutionBase() + with self.assertRaisesRegex( + ValueError, + 'Must provide exactly one of \'binary_graph_path\' or \'graph_config\'.' 
+ ): + solution_base.SolutionBase( + graph_config=calculator_pb2.CalculatorGraphConfig(), + binary_graph_path='/tmp/no_such.binarypb') + + @parameterized.named_parameters(('no_graph_input_output_stream', """ + node { + calculator: 'PassThroughCalculator' + input_stream: 'in' + output_stream: 'out' + } + """, RuntimeError, 'does not have a corresponding output stream.'), + ('calcualtor_io_mismatch', """ + node { + calculator: 'PassThroughCalculator' + input_stream: 'in' + input_stream: 'in2' + output_stream: 'out' + } + """, ValueError, 'must use matching tags and indexes.'), + ('unkown_registered_stream_type_name', """ + input_stream: 'in' + output_stream: 'out' + node { + calculator: 'PassThroughCalculator' + input_stream: 'in' + output_stream: 'out' + } + """, RuntimeError, 'Unable to find the type for stream \"in\".')) + def test_invalid_config(self, text_config, error_type, error_message): + config_proto = text_format.Parse(text_config, + calculator_pb2.CalculatorGraphConfig()) + with self.assertRaisesRegex(error_type, error_message): + solution_base.SolutionBase(graph_config=config_proto) + + def test_invalid_input_data_type(self): + text_config = """ + input_stream: 'input_detections' + output_stream: 'output_detections' + node { + calculator: 'DetectionUniqueIdCalculator' + input_stream: 'DETECTIONS:input_detections' + output_stream: 'DETECTIONS:output_detections' + } + """ + config_proto = text_format.Parse(text_config, + calculator_pb2.CalculatorGraphConfig()) + solution = solution_base.SolutionBase(graph_config=config_proto) + detection = detection_pb2.Detection() + text_format.Parse('score: 0.5', detection) + with self.assertRaisesRegex( + NotImplementedError, + 'SolutionBase can only process image data. PROTO_LIST type is not supported.' 
+ ): + solution.process({'input_detections': detection}) + + def test_invalid_input_image_data(self): + text_config = """ + input_stream: 'image_in' + output_stream: 'image_out' + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:image_in' + output_stream: 'IMAGE:transformed_image_in' + } + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:transformed_image_in' + output_stream: 'IMAGE:image_out' + } + """ + config_proto = text_format.Parse(text_config, + calculator_pb2.CalculatorGraphConfig()) + solution = solution_base.SolutionBase(graph_config=config_proto) + with self.assertRaisesRegex( + ValueError, 'Input image must contain three channel rgb data.'): + solution.process(np.arange(36, dtype=np.uint8).reshape(3, 3, 4)) + + @parameterized.named_parameters(('graph_without_side_packets', """ + input_stream: 'image_in' + output_stream: 'image_out' + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:image_in' + output_stream: 'IMAGE:transformed_image_in' + } + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:transformed_image_in' + output_stream: 'IMAGE:image_out' + } + """, None), ('graph_with_side_packets', """ + input_stream: 'image_in' + input_side_packet: 'allow_signal' + input_side_packet: 'rotation_degrees' + output_stream: 'image_out' + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:image_in' + input_side_packet: 'ROTATION_DEGREES:rotation_degrees' + output_stream: 'IMAGE:transformed_image_in' + } + node { + calculator: 'GateCalculator' + input_stream: 'transformed_image_in' + input_side_packet: 'ALLOW:allow_signal' + output_stream: 'image_out_to_transform' + } + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:image_out_to_transform' + input_side_packet: 'ROTATION_DEGREES:rotation_degrees' + output_stream: 'IMAGE:image_out' + }""", { + 'allow_signal': True, + 'rotation_degrees': 0 + })) + def test_solution_process(self, text_config, side_inputs): + self._process_and_verify( + config_proto=text_format.Parse(text_config, + calculator_pb2.CalculatorGraphConfig()), + side_inputs=side_inputs) + + def test_invalid_calculator_options(self): + text_config = """ + input_stream: 'image_in' + output_stream: 'image_out' + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:image_in' + output_stream: 'IMAGE:transformed_image_in' + } + node { + name: 'SignalGate' + calculator: 'GateCalculator' + input_stream: 'transformed_image_in' + input_side_packet: 'ALLOW:allow_signal' + output_stream: 'image_out_to_transform' + } + node { + calculator: 'ImageTransformationCalculator' + input_stream: 'IMAGE:image_out_to_transform' + output_stream: 'IMAGE:image_out' + } + """ + config_proto = text_format.Parse(text_config, + calculator_pb2.CalculatorGraphConfig()) + with self.assertRaisesRegex( + ValueError, + 'Modifying the calculator options of SignalGate is not supported.'): + solution_base.SolutionBase( + graph_config=config_proto, + calculator_params={'SignalGate.invalid_field': 'I am invalid'}) + + def test_calculator_has_both_options_and_node_options(self): + config_proto = text_format.Parse(CALCULATOR_OPTIONS_TEST_GRAPH_CONFIG, + calculator_pb2.CalculatorGraphConfig()) + with self.assertRaisesRegex(ValueError, + 'has both options and node_options fields.'): + solution_base.SolutionBase( + graph_config=config_proto, + calculator_params={ + 'ImageTransformation.output_width': 0, + 'ImageTransformation.output_height': 0 + }) 
+ + def test_modifying_calculator_proto2_options(self): + config_proto = text_format.Parse(CALCULATOR_OPTIONS_TEST_GRAPH_CONFIG, + calculator_pb2.CalculatorGraphConfig()) + # To test proto2 options only, remove the proto3 node_options field from the + # graph config. + self.assertEqual('ImageTransformation', config_proto.node[0].name) + config_proto.node[0].ClearField('node_options') + self._process_and_verify( + config_proto=config_proto, + calculator_params={ + 'ImageTransformation.output_width': 0, + 'ImageTransformation.output_height': 0 + }) + + def test_modifying_calculator_proto3_node_options(self): + config_proto = text_format.Parse(CALCULATOR_OPTIONS_TEST_GRAPH_CONFIG, + calculator_pb2.CalculatorGraphConfig()) + # To test proto3 node options only, remove the proto2 options field from the + # graph config. + self.assertEqual('ImageTransformation', config_proto.node[0].name) + config_proto.node[0].ClearField('options') + self._process_and_verify( + config_proto=config_proto, + calculator_params={ + 'ImageTransformation.output_width': 0, + 'ImageTransformation.output_height': 0 + }) + + def test_adding_calculator_options(self): + config_proto = text_format.Parse(CALCULATOR_OPTIONS_TEST_GRAPH_CONFIG, + calculator_pb2.CalculatorGraphConfig()) + # To test a calculator with no options field, remove both proto2 options and + # proto3 node_options fields from the graph config. + self.assertEqual('ImageTransformation', config_proto.node[0].name) + config_proto.node[0].ClearField('options') + config_proto.node[0].ClearField('node_options') + self._process_and_verify( + config_proto=config_proto, + calculator_params={ + 'ImageTransformation.output_width': 0, + 'ImageTransformation.output_height': 0 + }) + + def _process_and_verify(self, + config_proto, + side_inputs=None, + calculator_params=None): + input_image = np.arange(27, dtype=np.uint8).reshape(3, 3, 3) + solution = solution_base.SolutionBase( + graph_config=config_proto, + side_inputs=side_inputs, + calculator_params=calculator_params) + outputs = solution.process(input_image) + self.assertTrue(np.array_equal(input_image, outputs.image_out)) + outputs2 = solution.process({'image_in': input_image}) + self.assertTrue(np.array_equal(input_image, outputs2.image_out)) + solution.close() + + +if __name__ == '__main__': + absltest.main() diff --git a/mediapipe/python/solutions/__init__.py b/mediapipe/python/solutions/__init__.py new file mode 100644 index 0000000000..bf50d9e433 --- /dev/null +++ b/mediapipe/python/solutions/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
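The _process_and_verify helper above feeds the same image through both calling conventions of SolutionBase.process: a bare ndarray for a single-input graph, and a dict keyed by input stream name. A compact standalone sketch of the same pattern, reusing the typed ImageTransformationCalculator graph style from the tests (stream names are illustrative):

  import numpy as np

  from google.protobuf import text_format
  from mediapipe.framework import calculator_pb2
  from mediapipe.python import solution_base

  config = text_format.Parse(
      """
      input_stream: 'image_in'
      output_stream: 'image_out'
      node {
        calculator: 'ImageTransformationCalculator'
        input_stream: 'IMAGE:image_in'
        output_stream: 'IMAGE:image_out'
      }
      """, calculator_pb2.CalculatorGraphConfig())
  solution = solution_base.SolutionBase(graph_config=config)
  frame = np.arange(27, dtype=np.uint8).reshape(3, 3, 3)
  # A single-input graph accepts a bare ndarray; the dict form names the stream.
  assert np.array_equal(solution.process(frame).image_out, frame)
  assert np.array_equal(solution.process({'image_in': frame}).image_out, frame)
  solution.close()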
+ +"""MediaPipe Solutions Python API.""" + +import mediapipe.python.solutions.drawing_utils +import mediapipe.python.solutions.face_mesh +import mediapipe.python.solutions.hands +import mediapipe.python.solutions.pose diff --git a/mediapipe/python/solutions/drawing_utils.py b/mediapipe/python/solutions/drawing_utils.py new file mode 100644 index 0000000000..47fdd8419a --- /dev/null +++ b/mediapipe/python/solutions/drawing_utils.py @@ -0,0 +1,114 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""MediaPipe solution drawing utils.""" + +import math +from typing import List, Tuple, Union + +import cv2 +import dataclasses +import numpy as np + +from mediapipe.framework.formats import landmark_pb2 + +RGB_CHANNELS = 3 +RED_COLOR = (0, 0, 255) + + +@dataclasses.dataclass +class DrawingSpec: + # Color for drawing the annotation. Default to the green color. + color: Tuple[int, int, int] = (0, 255, 0) + # Thickness for drawing the annotation. Default to 2 pixels. + thickness: int = 2 + # Circle radius. Default to 2 pixels. + circle_radius: int = 2 + + +def _normalized_to_pixel_coordinates( + normalized_x: float, normalized_y: float, image_width: int, + image_height: int) -> Union[None, Tuple[int, int]]: + """Converts normalized value pair to pixel coordinates.""" + + # Checks if the float value is between 0 and 1. + def is_valid_normalized_value(value: float) -> bool: + return (value > 0 or math.isclose(0, value)) and (value < 1 or + math.isclose(1, value)) + + if not (is_valid_normalized_value(normalized_x) and + is_valid_normalized_value(normalized_y)): + # TODO: Draw coordinates even if it's outside of the image bounds. + return None + x_px = min(math.floor(normalized_x * image_width), image_width - 1) + y_px = min(math.floor(normalized_y * image_height), image_height - 1) + return x_px, y_px + + +def draw_landmarks( + image: np.ndarray, + landmark_list: landmark_pb2.NormalizedLandmarkList, + connections: List[Tuple[int, int]] = None, + landmark_drawing_spec: DrawingSpec = DrawingSpec(color=RED_COLOR), + connection_drawing_spec: DrawingSpec = DrawingSpec()): + """Draws the landmarks and the connections on the image. + + Args: + image: A three channel RGB image represented as numpy ndarray. + landmark_list: A normalized landmark list proto message to be annotated on + the image. + connections: A list of landmark index tuples that specifies how landmarks to + be connected in the drawing. + landmark_drawing_spec: A DrawingSpec object that specifies the landmarks' + drawing settings such as color, line thickness, and circle radius. + connection_drawing_spec: A DrawingSpec object that specifies the + connections' drawing settings such as color and line thickness. + + Raises: + ValueError: If one of the followings: + a) If the input image is not three channel RGB. + b) If any connetions contain invalid landmark index. 
+ """ + if not landmark_list: + return + if image.shape[2] != RGB_CHANNELS: + raise ValueError('Input image must contain three channel rgb data.') + image_rows, image_cols, _ = image.shape + idx_to_coordinates = {} + for idx, landmark in enumerate(landmark_list.landmark): + if landmark.visibility < 0 or landmark.presence < 0: + continue + landmark_px = _normalized_to_pixel_coordinates(landmark.x, landmark.y, + image_cols, image_rows) + if landmark_px: + idx_to_coordinates[idx] = landmark_px + if connections: + num_landmarks = len(landmark_list.landmark) + # Draws the connections if the start and end landmarks are both visible. + for connection in connections: + start_idx = connection[0] + end_idx = connection[1] + if not (0 <= start_idx < num_landmarks and 0 <= end_idx < num_landmarks): + raise ValueError(f'Landmark index is out of range. Invalid connection ' + f'from landmark #{start_idx} to landmark #{end_idx}.') + if start_idx in idx_to_coordinates and end_idx in idx_to_coordinates: + cv2.line(image, idx_to_coordinates[start_idx], + idx_to_coordinates[end_idx], connection_drawing_spec.color, + connection_drawing_spec.thickness) + # Draws landmark points after finishing the connection lines, which is + # aesthetically better. + for landmark_px in idx_to_coordinates.values(): + cv2.circle(image, landmark_px, landmark_drawing_spec.circle_radius, + landmark_drawing_spec.color, landmark_drawing_spec.thickness) diff --git a/mediapipe/python/solutions/drawing_utils_test.py b/mediapipe/python/solutions/drawing_utils_test.py new file mode 100644 index 0000000000..2241d58730 --- /dev/null +++ b/mediapipe/python/solutions/drawing_utils_test.py @@ -0,0 +1,144 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""Tests for mediapipe.python.solutions.drawing_utils.""" + +from absl.testing import absltest +from absl.testing import parameterized +import cv2 +import numpy as np + +from google.protobuf import text_format + +from mediapipe.framework.formats import landmark_pb2 +from mediapipe.python.solutions import drawing_utils + +DEFAULT_CONNECTION_DRAWING_SPEC = drawing_utils.DrawingSpec() +DEFAULT_LANDMARK_DRAWING_SPEC = drawing_utils.DrawingSpec(color=(0, 0, 255)) + + +class DrawingUtilTest(parameterized.TestCase): + + def test_invalid_input_image(self): + image = np.arange(18, dtype=np.uint8).reshape(3, 3, 2) + with self.assertRaisesRegex( + ValueError, 'Input image must contain three channel rgb data.'): + drawing_utils.draw_landmarks(image, landmark_pb2.NormalizedLandmarkList()) + + def test_invalid_connection(self): + landmark_list = text_format.Parse( + 'landmark {x: 0.5 y: 0.5} landmark {x: 0.2 y: 0.2}', + landmark_pb2.NormalizedLandmarkList()) + image = np.arange(27, dtype=np.uint8).reshape(3, 3, 3) + with self.assertRaisesRegex(ValueError, 'Landmark index is out of range.'): + drawing_utils.draw_landmarks(image, landmark_list, [(0, 2)]) + + @parameterized.named_parameters( + ('landmark_list_has_only_one_element', 'landmark {x: 0.1 y: 0.1}'), + ('second_landmark_is_invisible', + 'landmark {x: 0.1 y: 0.1} landmark {x: 0.5 y: 0.5 visibility: -1.0}')) + def test_draw_single_landmark_point(self, landmark_list_text): + landmark_list = text_format.Parse(landmark_list_text, + landmark_pb2.NormalizedLandmarkList()) + image = np.zeros((100, 100, 3), np.uint8) + expected_result = np.copy(image) + cv2.circle(expected_result, (10, 10), + DEFAULT_LANDMARK_DRAWING_SPEC.circle_radius, + DEFAULT_LANDMARK_DRAWING_SPEC.color, + DEFAULT_LANDMARK_DRAWING_SPEC.thickness) + drawing_utils.draw_landmarks(image, landmark_list) + np.testing.assert_array_equal(image, expected_result) + + @parameterized.named_parameters( + ('landmarks_have_x_and_y_only', + 'landmark {x: 0.1 y: 0.5} landmark {x: 0.5 y: 0.1}'), + ('landmark_zero_visibility_and_presence', + 'landmark {x: 0.1 y: 0.5 presence: 0.0}' + 'landmark {x: 0.5 y: 0.1 visibility: 0.0}')) + def test_draw_landmarks_and_connections(self, landmark_list_text): + landmark_list = text_format.Parse(landmark_list_text, + landmark_pb2.NormalizedLandmarkList()) + image = np.zeros((100, 100, 3), np.uint8) + expected_result = np.copy(image) + start_point = (10, 50) + end_point = (50, 10) + cv2.line(expected_result, start_point, end_point, + DEFAULT_CONNECTION_DRAWING_SPEC.color, + DEFAULT_CONNECTION_DRAWING_SPEC.thickness) + cv2.circle(expected_result, start_point, + DEFAULT_LANDMARK_DRAWING_SPEC.circle_radius, + DEFAULT_LANDMARK_DRAWING_SPEC.color, + DEFAULT_LANDMARK_DRAWING_SPEC.thickness) + cv2.circle(expected_result, end_point, + DEFAULT_LANDMARK_DRAWING_SPEC.circle_radius, + DEFAULT_LANDMARK_DRAWING_SPEC.color, + DEFAULT_LANDMARK_DRAWING_SPEC.thickness) + drawing_utils.draw_landmarks( + image=image, landmark_list=landmark_list, connections=[(0, 1)]) + np.testing.assert_array_equal(image, expected_result) + + def test_min_and_max_coordinate_values(self): + landmark_list = text_format.Parse( + 'landmark {x: 0.0 y: 1.0}' + 'landmark {x: 1.0 y: 0.0}', landmark_pb2.NormalizedLandmarkList()) + image = np.zeros((100, 100, 3), np.uint8) + expected_result = np.copy(image) + start_point = (0, 99) + end_point = (99, 0) + cv2.line(expected_result, start_point, end_point, + DEFAULT_CONNECTION_DRAWING_SPEC.color, + DEFAULT_CONNECTION_DRAWING_SPEC.thickness) + 
cv2.circle(expected_result, start_point, + DEFAULT_LANDMARK_DRAWING_SPEC.circle_radius, + DEFAULT_LANDMARK_DRAWING_SPEC.color, + DEFAULT_LANDMARK_DRAWING_SPEC.thickness) + cv2.circle(expected_result, end_point, + DEFAULT_LANDMARK_DRAWING_SPEC.circle_radius, + DEFAULT_LANDMARK_DRAWING_SPEC.color, + DEFAULT_LANDMARK_DRAWING_SPEC.thickness) + drawing_utils.draw_landmarks( + image=image, landmark_list=landmark_list, connections=[(0, 1)]) + np.testing.assert_array_equal(image, expected_result) + + def test_drawing_spec(self): + landmark_list = text_format.Parse( + 'landmark {x: 0.1 y: 0.1}' + 'landmark {x: 0.8 y: 0.8}', landmark_pb2.NormalizedLandmarkList()) + image = np.zeros((100, 100, 3), np.uint8) + landmark_drawing_spec = drawing_utils.DrawingSpec( + color=(0, 0, 255), thickness=5) + connection_drawing_spec = drawing_utils.DrawingSpec( + color=(255, 0, 0), thickness=3) + expected_result = np.copy(image) + start_point = (10, 10) + end_point = (80, 80) + cv2.line(expected_result, start_point, end_point, + connection_drawing_spec.color, connection_drawing_spec.thickness) + cv2.circle(expected_result, start_point, + landmark_drawing_spec.circle_radius, landmark_drawing_spec.color, + landmark_drawing_spec.thickness) + cv2.circle(expected_result, end_point, landmark_drawing_spec.circle_radius, + landmark_drawing_spec.color, landmark_drawing_spec.thickness) + drawing_utils.draw_landmarks( + image=image, + landmark_list=landmark_list, + connections=[(0, 1)], + landmark_drawing_spec=landmark_drawing_spec, + connection_drawing_spec=connection_drawing_spec) + np.testing.assert_array_equal(image, expected_result) + + +if __name__ == '__main__': + absltest.main() diff --git a/mediapipe/python/solutions/face_mesh.py b/mediapipe/python/solutions/face_mesh.py new file mode 100644 index 0000000000..2afcbfdf3c --- /dev/null +++ b/mediapipe/python/solutions/face_mesh.py @@ -0,0 +1,307 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
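The boundary behavior exercised by test_min_and_max_coordinate_values above comes from _normalized_to_pixel_coordinates: 0.0 and 1.0 are accepted and clamped to valid pixel indices, while values outside [0, 1] yield None. A quick check (the helper is private and is called here purely for illustration):

  from mediapipe.python.solutions import drawing_utils

  print(drawing_utils._normalized_to_pixel_coordinates(0.0, 1.0, 100, 100))  # (0, 99)
  print(drawing_utils._normalized_to_pixel_coordinates(1.0, 0.0, 100, 100))  # (99, 0)
  print(drawing_utils._normalized_to_pixel_coordinates(1.5, 0.5, 100, 100))  # None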
+ +# Lint as: python3 +"""MediaPipe FaceMesh.""" + +from typing import NamedTuple + +import numpy as np + +# pylint: disable=unused-import +from mediapipe.calculators.core import gate_calculator_pb2 +from mediapipe.calculators.core import split_vector_calculator_pb2 +from mediapipe.calculators.tensor import inference_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_classification_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_detections_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_landmarks_calculator_pb2 +from mediapipe.calculators.tflite import ssd_anchors_calculator_pb2 +from mediapipe.calculators.util import association_calculator_pb2 +from mediapipe.calculators.util import detections_to_rects_calculator_pb2 +from mediapipe.calculators.util import logic_calculator_pb2 +from mediapipe.calculators.util import non_max_suppression_calculator_pb2 +from mediapipe.calculators.util import rect_transformation_calculator_pb2 +from mediapipe.calculators.util import thresholding_calculator_pb2 +# pylint: enable=unused-import +from mediapipe.python.solution_base import SolutionBase + +BINARYPB_FILE_PATH = 'mediapipe/modules/face_landmark/face_landmark_front_cpu.binarypb' +FACE_CONNECTIONS = frozenset([ + # Lips. + (61, 146), + (146, 91), + (91, 181), + (181, 84), + (84, 17), + (17, 314), + (314, 405), + (405, 321), + (321, 375), + (375, 291), + (61, 185), + (185, 40), + (40, 39), + (39, 37), + (37, 0), + (0, 267), + (267, 269), + (269, 270), + (270, 409), + (409, 291), + (78, 95), + (95, 88), + (88, 178), + (178, 87), + (87, 14), + (14, 317), + (317, 402), + (402, 318), + (318, 324), + (324, 308), + (78, 191), + (191, 80), + (80, 81), + (81, 82), + (82, 13), + (13, 312), + (312, 311), + (311, 310), + (310, 415), + (415, 308), + # Left eye. + (33, 7), + (7, 163), + (163, 144), + (144, 145), + (145, 153), + (153, 154), + (154, 155), + (155, 133), + (33, 246), + (246, 161), + (161, 160), + (160, 159), + (159, 158), + (158, 157), + (157, 173), + (173, 133), + # Left eyebrow. + (46, 53), + (53, 52), + (52, 65), + (65, 55), + (70, 63), + (63, 105), + (105, 66), + (66, 107), + # Right eye. + (263, 249), + (249, 390), + (390, 373), + (373, 374), + (374, 380), + (380, 381), + (381, 382), + (382, 362), + (263, 466), + (466, 388), + (388, 387), + (387, 386), + (386, 385), + (385, 384), + (384, 398), + (398, 362), + # Right eyebrow. + (276, 283), + (283, 282), + (282, 295), + (295, 285), + (300, 293), + (293, 334), + (334, 296), + (296, 336), + # Face oval. + (10, 338), + (338, 297), + (297, 332), + (332, 284), + (284, 251), + (251, 389), + (389, 356), + (356, 454), + (454, 323), + (323, 361), + (361, 288), + (288, 397), + (397, 365), + (365, 379), + (379, 378), + (378, 400), + (400, 377), + (377, 152), + (152, 148), + (148, 176), + (176, 149), + (149, 150), + (150, 136), + (136, 172), + (172, 58), + (58, 132), + (132, 93), + (93, 234), + (234, 127), + (127, 162), + (162, 21), + (21, 54), + (54, 103), + (103, 67), + (67, 109), + (109, 10) +]) + + +class FaceMesh(SolutionBase): + """MediaPipe FaceMesh. + + MediaPipe FaceMesh processes an RGB image and returns the face landmarks on + each detected face. 
+ + Usage examples: + import cv2 + import mediapipe as mp + mp_drawing = mp.solutions.drawing_utils + mp_face_mesh = mp.solutions.face_mesh + + # For static images: + face_mesh = mp_face_mesh.FaceMesh( + static_image_mode=True, + max_num_faces=1, + min_detection_confidence=0.5) + drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1) + for idx, file in enumerate(file_list): + image = cv2.imread(file) + # Convert the BGR image to RGB before processing. + results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Print and draw face mesh landmarks on the image. + if not results.multi_face_landmarks: + continue + annotated_image = image.copy() + for face_landmarks in results.multi_face_landmarks: + print('face_landmarks:', face_landmarks) + mp_drawing.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACE_CONNECTIONS, + landmark_drawing_spec=drawing_spec, + connection_drawing_spec=drawing_spec) + cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', image) + face_mesh.close() + + # For webcam input: + face_mesh = mp_face_mesh.FaceMesh( + min_detection_confidence=0.5, min_tracking_confidence=0.5) + drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1) + cap = cv2.VideoCapture(0) + while cap.isOpened(): + success, image = cap.read() + if not success: + break + + # Flip the image horizontally for a later selfie-view display, and convert + # the BGR image to RGB. + image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB) + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + results = face_mesh.process(image) + + # Draw the face mesh annotations on the image. + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + if results.multi_face_landmarks: + for face_landmarks in results.multi_face_landmarks: + mp_drawing.draw_landmarks( + image=image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACE_CONNECTIONS, + landmark_drawing_spec=drawing_spec, + connection_drawing_spec=drawing_spec) + cv2.imshow('MediaPipe FaceMesh', image) + if cv2.waitKey(5) & 0xFF == 27: + break + face_mesh.close() + cap.release() + """ + + def __init__(self, + static_image_mode=False, + max_num_faces=2, + min_detection_confidence=0.5, + min_tracking_confidence=0.5): + """Initializes a MediaPipe FaceMesh object. + + Args: + static_image_mode: If set to False, the solution treats the input images + as a video stream. It will try to detect faces in the first input + images, and upon a successful detection further localizes the face + landmarks. In subsequent images, once all "max_num_faces" faces are + detected and the corresponding face landmarks are localized, it simply + tracks those landmarks without invoking another detection until it loses + track of any of the faces. This reduces latency and is ideal for + processing video frames. If set to True, face detection runs on every + input image, ideal for processing a batch of static, possibly unrelated, + images. Default to False. + max_num_faces: Maximum number of faces to detect. Default to 2. + min_detection_confidence: Minimum confidence value ([0.0, 1.0]) from the + face detection model for the detection to be considered successful. + Default to 0.5. 
+      min_tracking_confidence: Minimum confidence value ([0.0, 1.0]) from the
+        landmark-tracking model for the face landmarks to be considered tracked
+        successfully, or otherwise face detection will be invoked automatically
+        on the next input image. Setting it to a higher value can increase
+        robustness of the solution, at the expense of a higher latency. Ignored
+        if "static_image_mode" is True, where face detection simply runs on
+        every image. Default to 0.5.
+    """
+    super().__init__(
+        binary_graph_path=BINARYPB_FILE_PATH,
+        side_inputs={
+            'num_faces': max_num_faces,
+            'can_skip_detection': not static_image_mode,
+        },
+        calculator_params={
+            'facedetectionfrontcpu__TensorsToDetectionsCalculator.min_score_thresh':
+                min_detection_confidence,
+            'facelandmarkcpu__ThresholdingCalculator.threshold':
+                min_tracking_confidence,
+        },
+        outputs=['multi_face_landmarks'])
+
+  def process(self, image: np.ndarray) -> NamedTuple:
+    """Processes an RGB image and returns the face landmarks on each detected face.
+
+    Args:
+      image: An RGB image represented as a numpy ndarray.
+
+    Raises:
+      RuntimeError: If any error occurs in the underlying graph.
+      ValueError: If the input image is not three channel RGB.
+
+    Returns:
+      A NamedTuple object with a "multi_face_landmarks" field that contains the
+      face landmarks on each detected face.
+    """
+
+    return super().process(input_data={'image': image})
diff --git a/mediapipe/python/solutions/hands.py b/mediapipe/python/solutions/hands.py
new file mode 100644
index 0000000000..8253e344cd
--- /dev/null
+++ b/mediapipe/python/solutions/hands.py
@@ -0,0 +1,228 @@
+# Copyright 2020 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
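Beyond the drawing-centric examples in the FaceMesh docstring above, each entry of results.multi_face_landmarks is an ordinary NormalizedLandmarkList, so coordinates can be read directly; a short sketch, with a placeholder image path:

  import cv2
  import mediapipe as mp

  face_mesh = mp.solutions.face_mesh.FaceMesh(
      static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5)
  image = cv2.imread('/tmp/face.png')  # placeholder path
  results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
  if results.multi_face_landmarks:
    image_rows, image_cols, _ = image.shape
    first = results.multi_face_landmarks[0].landmark[0]
    # Landmark coordinates are normalized to [0, 1]; scale by the image size.
    print('landmark 0 (px):', int(first.x * image_cols), int(first.y * image_rows))
  face_mesh.close()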
+ +# Lint as: python3 +"""MediaPipe Hands.""" + +import enum +from typing import NamedTuple + +import numpy as np + +# pylint: disable=unused-import +from mediapipe.calculators.core import gate_calculator_pb2 +from mediapipe.calculators.core import split_vector_calculator_pb2 +from mediapipe.calculators.tensor import image_to_tensor_calculator_pb2 +from mediapipe.calculators.tensor import inference_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_classification_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_detections_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_landmarks_calculator_pb2 +from mediapipe.calculators.tflite import ssd_anchors_calculator_pb2 +from mediapipe.calculators.util import association_calculator_pb2 +from mediapipe.calculators.util import detections_to_rects_calculator_pb2 +from mediapipe.calculators.util import logic_calculator_pb2 +from mediapipe.calculators.util import non_max_suppression_calculator_pb2 +from mediapipe.calculators.util import rect_transformation_calculator_pb2 +from mediapipe.calculators.util import thresholding_calculator_pb2 +# pylint: enable=unused-import +from mediapipe.python.solution_base import SolutionBase + + +class HandLandmark(enum.IntEnum): + """The 21 hand landmarks.""" + WRIST = 0 + THUMB_CMC = 1 + THUMB_MCP = 2 + THUMB_IP = 3 + THUMB_TIP = 4 + INDEX_FINGER_MCP = 5 + INDEX_FINGER_PIP = 6 + INDEX_FINGER_DIP = 7 + INDEX_FINGER_TIP = 8 + MIDDLE_FINGER_MCP = 9 + MIDDLE_FINGER_PIP = 10 + MIDDLE_FINGER_DIP = 11 + MIDDLE_FINGER_TIP = 12 + RING_FINGER_MCP = 13 + RING_FINGER_PIP = 14 + RING_FINGER_DIP = 15 + RING_FINGER_TIP = 16 + PINKY_MCP = 17 + PINKY_PIP = 18 + PINKY_DIP = 19 + PINKY_TIP = 20 + + +BINARYPB_FILE_PATH = 'mediapipe/modules/hand_landmark/hand_landmark_tracking_cpu.binarypb' +HAND_CONNECTIONS = frozenset([ + (HandLandmark.WRIST, HandLandmark.THUMB_CMC), + (HandLandmark.THUMB_CMC, HandLandmark.THUMB_MCP), + (HandLandmark.THUMB_MCP, HandLandmark.THUMB_IP), + (HandLandmark.THUMB_IP, HandLandmark.THUMB_TIP), + (HandLandmark.WRIST, HandLandmark.INDEX_FINGER_MCP), + (HandLandmark.INDEX_FINGER_MCP, HandLandmark.INDEX_FINGER_PIP), + (HandLandmark.INDEX_FINGER_PIP, HandLandmark.INDEX_FINGER_DIP), + (HandLandmark.INDEX_FINGER_DIP, HandLandmark.INDEX_FINGER_TIP), + (HandLandmark.INDEX_FINGER_MCP, HandLandmark.MIDDLE_FINGER_MCP), + (HandLandmark.MIDDLE_FINGER_MCP, HandLandmark.MIDDLE_FINGER_PIP), + (HandLandmark.MIDDLE_FINGER_PIP, HandLandmark.MIDDLE_FINGER_DIP), + (HandLandmark.MIDDLE_FINGER_DIP, HandLandmark.MIDDLE_FINGER_TIP), + (HandLandmark.MIDDLE_FINGER_MCP, HandLandmark.RING_FINGER_MCP), + (HandLandmark.RING_FINGER_MCP, HandLandmark.RING_FINGER_PIP), + (HandLandmark.RING_FINGER_PIP, HandLandmark.RING_FINGER_DIP), + (HandLandmark.RING_FINGER_DIP, HandLandmark.RING_FINGER_TIP), + (HandLandmark.RING_FINGER_MCP, HandLandmark.PINKY_MCP), + (HandLandmark.WRIST, HandLandmark.PINKY_MCP), + (HandLandmark.PINKY_MCP, HandLandmark.PINKY_PIP), + (HandLandmark.PINKY_PIP, HandLandmark.PINKY_DIP), + (HandLandmark.PINKY_DIP, HandLandmark.PINKY_TIP) +]) + + +class Hands(SolutionBase): + """MediaPipe Hands. + + MediaPipe Hands processes an RGB image and returns the hand landmarks and + handedness (left v.s. right hand) of each detected hand. + + Note that it determines handedness assuming the input image is mirrored, + i.e., taken with a front-facing/selfie camera ( + https://en.wikipedia.org/wiki/Front-facing_camera) with images flipped + horizontally. 
If that is not the case, use, for instance, cv2.flip(image, 1)
+  to flip the image first for a correct handedness output.
+
+  Usage examples:
+    import cv2
+    import mediapipe as mp
+    mp_drawing = mp.solutions.drawing_utils
+    mp_hands = mp.solutions.hands
+
+    # For static images:
+    hands = mp_hands.Hands(
+        static_image_mode=True,
+        max_num_hands=2,
+        min_detection_confidence=0.7)
+    for idx, file in enumerate(file_list):
+      # Read an image, flip it around y-axis for correct handedness output (see
+      # above).
+      image = cv2.flip(cv2.imread(file), 1)
+      # Convert the BGR image to RGB before processing.
+      results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+
+      # Print handedness and draw hand landmarks on the image.
+      print('handedness:', results.multi_handedness)
+      if not results.multi_hand_landmarks:
+        continue
+      annotated_image = image.copy()
+      for hand_landmarks in results.multi_hand_landmarks:
+        print('hand_landmarks:', hand_landmarks)
+        mp_drawing.draw_landmarks(
+            annotated_image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
+      cv2.imwrite(
+          '/tmp/annotated_image' + str(idx) + '.png',
+          cv2.flip(annotated_image, 1))
+    hands.close()
+
+    # For webcam input:
+    hands = mp_hands.Hands(
+        min_detection_confidence=0.7, min_tracking_confidence=0.5)
+    cap = cv2.VideoCapture(0)
+    while cap.isOpened():
+      success, image = cap.read()
+      if not success:
+        break
+
+      # Flip the image horizontally for a later selfie-view display, and convert
+      # the BGR image to RGB.
+      image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
+      # To improve performance, optionally mark the image as not writeable to
+      # pass by reference.
+      image.flags.writeable = False
+      results = hands.process(image)
+
+      # Draw the hand annotations on the image.
+      image.flags.writeable = True
+      image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+      if results.multi_hand_landmarks:
+        for hand_landmarks in results.multi_hand_landmarks:
+          mp_drawing.draw_landmarks(
+              image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
+      cv2.imshow('MediaPipe Hands', image)
+      if cv2.waitKey(5) & 0xFF == 27:
+        break
+    hands.close()
+    cap.release()
+  """
+
+  def __init__(self,
+               static_image_mode=False,
+               max_num_hands=2,
+               min_detection_confidence=0.7,
+               min_tracking_confidence=0.5):
+    """Initializes a MediaPipe Hands object.
+
+    Args:
+      static_image_mode: If set to False, the solution treats the input images
+        as a video stream. It will try to detect hands in the first input
+        images, and upon a successful detection further localizes the hand
+        landmarks. In subsequent images, once all "max_num_hands" hands are
+        detected and the corresponding hand landmarks are localized, it simply
+        tracks those landmarks without invoking another detection until it
+        loses track of any of the hands. This reduces latency and is ideal for
+        processing video frames. If set to True, hand detection runs on every
+        input image, ideal for processing a batch of static, possibly
+        unrelated, images. Defaults to False.
+      max_num_hands: Maximum number of hands to detect. Defaults to 2.
+      min_detection_confidence: Minimum confidence value ([0.0, 1.0]) from the
+        hand detection model for the detection to be considered successful.
+        Defaults to 0.7.
+      min_tracking_confidence: Minimum confidence value ([0.0, 1.0]) from the
+        landmark-tracking model for the hand landmarks to be considered tracked
+        successfully, or otherwise hand detection will be invoked automatically
+        on the next input image. Setting it to a higher value can increase
+        robustness of the solution, at the expense of a higher latency.
+        Ignored if "static_image_mode" is True, where hand detection simply
+        runs on every image. Defaults to 0.5.
+    """
+    super().__init__(
+        binary_graph_path=BINARYPB_FILE_PATH,
+        side_inputs={
+            'num_hands': max_num_hands,
+            'can_skip_detection': not static_image_mode,
+        },
+        calculator_params={
+            'palmdetectioncpu__TensorsToDetectionsCalculator.min_score_thresh':
+                min_detection_confidence,
+            'handlandmarkcpu__ThresholdingCalculator.threshold':
+                min_tracking_confidence,
+        },
+        outputs=['multi_hand_landmarks', 'multi_handedness'])
+
+  def process(self, image: np.ndarray) -> NamedTuple:
+    """Processes an RGB image and returns the hand landmarks and handedness of each detected hand.
+
+    Args:
+      image: An RGB image represented as a numpy ndarray.
+
+    Raises:
+      RuntimeError: If the underlying graph throws any error.
+      ValueError: If the input image is not three-channel RGB.
+
+    Returns:
+      A NamedTuple object with two fields: a "multi_hand_landmarks" field that
+      contains the hand landmarks on each detected hand and a "multi_handedness"
+      field that contains the handedness (left vs. right hand) of each detected
+      hand.
+    """
+
+    return super().process(input_data={'image': image})
diff --git a/mediapipe/python/solutions/pose.py b/mediapipe/python/solutions/pose.py
new file mode 100644
index 0000000000..2f60be8b0a
--- /dev/null
+++ b/mediapipe/python/solutions/pose.py
@@ -0,0 +1,213 @@
+# Copyright 2020 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
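[Editorial sketch, not part of this patch.] The Hands API above returns two parallel lists, multi_hand_landmarks and multi_handedness, and the HandLandmark enum can index a specific landmark; the helper below assumes each handedness entry carries a classification list with a label, as the handedness print in the usage example suggests.

  import mediapipe as mp

  mp_hands = mp.solutions.hands

  def print_index_fingertips(results):
    # `results` is assumed to come from a prior Hands.process() call.
    if not results.multi_hand_landmarks:
      return
    for landmarks, handedness in zip(results.multi_hand_landmarks,
                                     results.multi_handedness):
      tip = landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
      label = handedness.classification[0].label  # e.g. 'Left' or 'Right'.
      print(label, tip.x, tip.y, tip.z)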
+ +# Lint as: python3 +"""MediaPipe Pose.""" + +import enum +from typing import NamedTuple + +import numpy as np + +# pylint: disable=unused-import +from mediapipe.calculators.core import gate_calculator_pb2 +from mediapipe.calculators.core import split_vector_calculator_pb2 +from mediapipe.calculators.tensor import image_to_tensor_calculator_pb2 +from mediapipe.calculators.tensor import inference_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_classification_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_detections_calculator_pb2 +from mediapipe.calculators.tensor import tensors_to_landmarks_calculator_pb2 +from mediapipe.calculators.util import detections_to_rects_calculator_pb2 +from mediapipe.calculators.util import landmarks_smoothing_calculator_pb2 +from mediapipe.calculators.util import logic_calculator_pb2 +from mediapipe.calculators.util import non_max_suppression_calculator_pb2 +from mediapipe.calculators.util import rect_transformation_calculator_pb2 +from mediapipe.calculators.util import thresholding_calculator_pb2 +# pylint: enable=unused-import +from mediapipe.python.solution_base import SolutionBase + + +class PoseLandmark(enum.IntEnum): + """The 25 (upper-body) pose landmarks.""" + NOSE = 0 + RIGHT_EYE_INNER = 1 + RIGHT_EYE = 2 + RIGHT_EYE_OUTER = 3 + LEFT_EYE_INNER = 4 + LEFT_EYE = 5 + LEFT_EYE_OUTER = 6 + RIGHT_EAR = 7 + LEFT_EAR = 8 + MOUTH_RIGHT = 9 + MOUTH_LEFT = 10 + RIGHT_SHOULDER = 11 + LEFT_SHOULDER = 12 + RIGHT_ELBOW = 13 + LEFT_ELBOW = 14 + RIGHT_WRIST = 15 + LEFT_WRIST = 16 + RIGHT_PINKY = 17 + LEFT_PINKY = 18 + RIGHT_INDEX = 19 + LEFT_INDEX = 20 + RIGHT_THUMB = 21 + LEFT_THUMB = 22 + RIGHT_HIP = 23 + LEFT_HIP = 24 + + +BINARYPB_FILE_PATH = 'mediapipe/modules/pose_landmark/pose_landmark_upper_body_smoothed_cpu.binarypb' +POSE_CONNECTIONS = frozenset([ + (PoseLandmark.NOSE, PoseLandmark.RIGHT_EYE_INNER), + (PoseLandmark.RIGHT_EYE_INNER, PoseLandmark.RIGHT_EYE), + (PoseLandmark.RIGHT_EYE, PoseLandmark.RIGHT_EYE_OUTER), + (PoseLandmark.RIGHT_EYE_OUTER, PoseLandmark.RIGHT_EAR), + (PoseLandmark.NOSE, PoseLandmark.LEFT_EYE_INNER), + (PoseLandmark.LEFT_EYE_INNER, PoseLandmark.LEFT_EYE), + (PoseLandmark.LEFT_EYE, PoseLandmark.LEFT_EYE_OUTER), + (PoseLandmark.LEFT_EYE_OUTER, PoseLandmark.LEFT_EAR), + (PoseLandmark.MOUTH_RIGHT, PoseLandmark.MOUTH_LEFT), + (PoseLandmark.RIGHT_SHOULDER, PoseLandmark.LEFT_SHOULDER), + (PoseLandmark.RIGHT_SHOULDER, PoseLandmark.RIGHT_ELBOW), + (PoseLandmark.RIGHT_ELBOW, PoseLandmark.RIGHT_WRIST), + (PoseLandmark.RIGHT_WRIST, PoseLandmark.RIGHT_PINKY), + (PoseLandmark.RIGHT_WRIST, PoseLandmark.RIGHT_INDEX), + (PoseLandmark.RIGHT_WRIST, PoseLandmark.RIGHT_THUMB), + (PoseLandmark.RIGHT_PINKY, PoseLandmark.RIGHT_INDEX), + (PoseLandmark.LEFT_SHOULDER, PoseLandmark.LEFT_ELBOW), + (PoseLandmark.LEFT_ELBOW, PoseLandmark.LEFT_WRIST), + (PoseLandmark.LEFT_WRIST, PoseLandmark.LEFT_PINKY), + (PoseLandmark.LEFT_WRIST, PoseLandmark.LEFT_INDEX), + (PoseLandmark.LEFT_WRIST, PoseLandmark.LEFT_THUMB), + (PoseLandmark.LEFT_PINKY, PoseLandmark.LEFT_INDEX), + (PoseLandmark.RIGHT_SHOULDER, PoseLandmark.RIGHT_HIP), + (PoseLandmark.LEFT_SHOULDER, PoseLandmark.LEFT_HIP), + (PoseLandmark.RIGHT_HIP, PoseLandmark.LEFT_HIP) +]) + + +class Pose(SolutionBase): + """MediaPipe Pose. + + MediaPipe Pose processes an RGB image and returns pose landmarks on the most + prominent person detected. 
+
+  Usage examples:
+    import cv2
+    import mediapipe as mp
+    mp_drawing = mp.solutions.drawing_utils
+    mp_pose = mp.solutions.pose
+
+    # For static images:
+    pose = mp_pose.Pose(
+        static_image_mode=True, min_detection_confidence=0.5)
+    for idx, file in enumerate(file_list):
+      image = cv2.imread(file)
+      # Convert the BGR image to RGB before processing.
+      results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+
+      # Print and draw pose landmarks on the image.
+      if not results.pose_landmarks:
+        continue
+      print(
+          'nose landmark:',
+          results.pose_landmarks.landmark[mp_pose.PoseLandmark.NOSE])
+      annotated_image = image.copy()
+      mp_drawing.draw_landmarks(
+          annotated_image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
+      cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image)
+    pose.close()
+
+    # For webcam input:
+    pose = mp_pose.Pose(
+        min_detection_confidence=0.5, min_tracking_confidence=0.5)
+    cap = cv2.VideoCapture(0)
+    while cap.isOpened():
+      success, image = cap.read()
+      if not success:
+        break
+
+      # Flip the image horizontally for a later selfie-view display, and convert
+      # the BGR image to RGB.
+      image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
+      # To improve performance, optionally mark the image as not writeable to
+      # pass by reference.
+      image.flags.writeable = False
+      results = pose.process(image)
+
+      # Draw the pose annotation on the image.
+      image.flags.writeable = True
+      image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+      mp_drawing.draw_landmarks(
+          image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
+      cv2.imshow('MediaPipe Pose', image)
+      if cv2.waitKey(5) & 0xFF == 27:
+        break
+    pose.close()
+    cap.release()
+  """
+
+  def __init__(self,
+               static_image_mode=False,
+               min_detection_confidence=0.5,
+               min_tracking_confidence=0.5):
+    """Initializes a MediaPipe Pose object.
+
+    Args:
+      static_image_mode: If set to False, the solution treats the input images
+        as a video stream. It will try to detect the most prominent person in
+        the very first images, and upon a successful detection further
+        localizes the pose landmarks. In subsequent images, it then simply
+        tracks those landmarks without invoking another detection until it
+        loses track, which reduces computation and latency. If set to True,
+        person detection runs on every input image, ideal for processing a
+        batch of static, possibly unrelated, images. Defaults to False.
+      min_detection_confidence: Minimum confidence value ([0.0, 1.0]) from the
+        person-detection model for the detection to be considered successful.
+        Defaults to 0.5.
+      min_tracking_confidence: Minimum confidence value ([0.0, 1.0]) from the
+        landmark-tracking model for the pose landmarks to be considered tracked
+        successfully, or otherwise person detection will be invoked
+        automatically on the next input image. Setting it to a higher value can
+        increase robustness of the solution, at the expense of a higher latency.
+        Ignored if "static_image_mode" is True, where person detection simply
+        runs on every image. Defaults to 0.5.
+ """ + super().__init__( + binary_graph_path=BINARYPB_FILE_PATH, + side_inputs={ + 'can_skip_detection': not static_image_mode, + }, + calculator_params={ + 'poselandmarkupperbodycpu__posedetectioncpu__TensorsToDetectionsCalculator.min_score_thresh': + min_detection_confidence, + 'poselandmarkupperbodycpu__poselandmarkupperbodybyroicpu__ThresholdingCalculator.threshold': + min_tracking_confidence, + }, + outputs=['pose_landmarks']) + + def process(self, image: np.ndarray) -> NamedTuple: + """Processes an RGB image and returns the pose landmarks on the most prominent person detected. + + Args: + image: An RGB image represented as a numpy ndarray. + + Raises: + RuntimeError: If the underlying graph occurs any error. + ValueError: If the input image is not three channel RGB. + + Returns: + A NamedTuple object with a "pose_landmarks" field that contains the pose + landmarks on the most prominent person detected. + """ + + return super().process(input_data={'image': image}) diff --git a/mediapipe/util/tracking/parallel_invoker.h b/mediapipe/util/tracking/parallel_invoker.h index cc2f6600c9..823522310b 100644 --- a/mediapipe/util/tracking/parallel_invoker.h +++ b/mediapipe/util/tracking/parallel_invoker.h @@ -236,7 +236,7 @@ inline void CheckAndSetInvokerOptions() { LOG(WARNING) << "Unsupported invoker mode selected on Android. " << "OpenMP linkage detected, so falling back to OpenMP"; flags_parallel_invoker_mode = PARALLEL_INVOKER_OPENMP; -#else // _OPENMP +#else // _OPENMP // Fallback mode for active parallel invoker without OpenMP is ThreadPool. LOG(WARNING) << "Unsupported invoker mode selected on Android. " << "Falling back to ThreadPool"; @@ -273,7 +273,7 @@ inline void CheckAndSetInvokerOptions() { #endif // _OPENMP } -#else // PARALLEL_INVOKER_ACTIVE +#else // PARALLEL_INVOKER_ACTIVE if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE) { LOG(ERROR) << "Parallel execution requested but PARALLEL_INVOKER_ACTIVE " << "compile flag is not set. Falling back to single threaded " diff --git a/mediapipe/util/tracking/region_flow_computation.cc b/mediapipe/util/tracking/region_flow_computation.cc index 89ee780aa9..88a954f5f9 100644 --- a/mediapipe/util/tracking/region_flow_computation.cc +++ b/mediapipe/util/tracking/region_flow_computation.cc @@ -2097,8 +2097,8 @@ void RegionFlowComputation::WideBaselineMatchFeatures( !defined(CV_WRAPPER_3X) LOG(FATAL) << "Supported on only with OpenCV 3.0. " << "Use bazel build flag : --define CV_WRAPPER=3X"; -#else // (defined(__ANDROID__) || defined(__APPLE__) || - // defined(__EMSCRIPTEN__)) && !defined(CV_WRAPPER_3X) +#else // (defined(__ANDROID__) || defined(__APPLE__) || + // defined(__EMSCRIPTEN__)) && !defined(CV_WRAPPER_3X) results->clear(); const auto& frame1 = from_data_ptr->frame; diff --git a/requirements.txt b/requirements.txt index 02b3ac75a0..cbd7535544 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ absl-py +dataclasses numpy opencv-python>=3.4.0,<4.0.0 protobuf>=3.11.4 diff --git a/setup.py b/setup.py index 1a67a31f73..32ccac55ff 100644 --- a/setup.py +++ b/setup.py @@ -15,9 +15,6 @@ Setup for MediaPipe package with setuptools. 
""" -from distutils import spawn -import distutils.command.build as build -import distutils.command.clean as clean import glob import os import posixpath @@ -29,6 +26,11 @@ import setuptools import setuptools.command.build_ext as build_ext import setuptools.command.install as install +# It is recommended to import setuptools prior to importing distutils to avoid +# using legacy behavior from distutils. +from distutils import spawn +import distutils.command.build as build +import distutils.command.clean as clean __version__ = '0.7' MP_ROOT_PATH = os.path.dirname(os.path.abspath(__file__)) @@ -39,6 +41,8 @@ MP_THIRD_PARTY_BUILD = os.path.join(MP_ROOT_PATH, 'third_party/BUILD') MP_THIRD_PARTY_BUILD_BACKUP = os.path.join(MP_ROOT_PATH, 'third_party/BUILD.backup') +MP_CALCULATORS_DIR_INIT_PY = os.path.join(MP_ROOT_PATH, + 'mediapipe/calculators/__init__.py') if not os.path.exists(ROOT_INIT_PY): open(ROOT_INIT_PY, 'w').close() @@ -53,7 +57,7 @@ def _parse_requirements(path): def _get_long_description(): - # fix the image urls. + # Fix the image urls. return re.sub( r'(docs/images/|docs/images/mobile/)([A-Za-z0-9_]*\.(png|gif))', r'https://github.com/google/mediapipe/blob/master/\g<1>\g<2>?raw=true', @@ -82,10 +86,11 @@ def _check_bazel(): sys.stderr.write('invalid bazel version number: %s\n' % version_segments) sys.exit(-1) bazel_version = int(''.join(['%03d' % int(seg) for seg in version_segments])) - if bazel_version < 3400000: + if bazel_version < 3004000: sys.stderr.write( 'the current bazel version is older than the minimum version that MediaPipe can support. Please upgrade bazel.' ) + sys.exit(-1) class ModifyInitFiles(setuptools.Command): @@ -103,10 +108,10 @@ def run(self): # Save the original init file. shutil.copyfile(MP_DIR_INIT_PY, MP_DIR_INIT_PY_BACKUP) mp_dir_init_file = open(MP_DIR_INIT_PY, 'a') - mp_dir_init_file.writelines([ - '\n', 'import mediapipe.examples.python as examples\n', - 'from mediapipe.python import *\n', '\n' - ]) + mp_dir_init_file.writelines( + ['\n', 'from mediapipe.python import *\n', + 'import mediapipe.python.solutions as solutions', + '\n']) mp_dir_init_file.close() @@ -132,19 +137,31 @@ def run(self): '-compiler\' (linux) or \'brew install protobuf\'(macos) to install ' 'protobuf compiler binary.') sys.exit(-1) - # Build framework protos. - for proto_file in glob.glob( - 'mediapipe/framework/**/*.proto', recursive=True): - if proto_file.endswith('test.proto'): - continue - proto_dir = os.path.dirname(os.path.abspath(proto_file)) - if proto_dir.endswith('testdata'): - continue - init_py = os.path.join(proto_dir, '__init__.py') - if not os.path.exists(init_py): - sys.stderr.write('adding necessary __init__ file: %s\n' % init_py) - open(init_py, 'w').close() - self._generate_proto(proto_file) + # Build framework and calculator protos. + if not os.path.exists(MP_CALCULATORS_DIR_INIT_PY): + sys.stderr.write('adding __init__ file: %s\n' % + MP_CALCULATORS_DIR_INIT_PY) + open(MP_CALCULATORS_DIR_INIT_PY, 'w').close() + for pattern in [ + 'mediapipe/framework/**/*.proto', 'mediapipe/calculators/**/*.proto', + 'mediapipe/gpu/**/*.proto', 'mediapipe/util/**/*.proto' + ]: + for proto_file in glob.glob(pattern, recursive=True): + # Ignore test protos. + if proto_file.endswith('test.proto'): + continue + # Ignore tensorflow protos. + if 'mediapipe/calculators/tensorflow' in proto_file: + continue + proto_dir = os.path.dirname(os.path.abspath(proto_file)) + # Ignore testdata dir. 
+ if proto_dir.endswith('testdata'): + continue + init_py = os.path.join(proto_dir, '__init__.py') + if not os.path.exists(init_py): + sys.stderr.write('adding __init__ file: %s\n' % init_py) + open(init_py, 'w').close() + self._generate_proto(proto_file) def _generate_proto(self, source): """Invokes the Protocol Compiler to generate a _pb2.py.""" @@ -169,10 +186,14 @@ class BuildBinaryGraphs(build.build): def run(self): _check_bazel() - binary_graphs = ['pose_tracking/upper_body_pose_tracking_cpu_binary_graph'] + binary_graphs = [ + 'face_landmark/face_landmark_front_cpu', + 'hand_landmark/hand_landmark_tracking_cpu', + 'pose_landmark/pose_landmark_upper_body_smoothed_cpu' + ] for binary_graph in binary_graphs: sys.stderr.write('generating binarypb: %s\n' % - os.path.join('mediapipe/graphs/', binary_graph)) + os.path.join('mediapipe/modules/', binary_graph)) self._generate_binary_graph(binary_graph) def _generate_binary_graph(self, graph_path): @@ -184,14 +205,14 @@ def _generate_binary_graph(self, graph_path): '--compilation_mode=opt', '--define=MEDIAPIPE_DISABLE_GPU=1', '--action_env=PYTHON_BIN_PATH=' + sys.executable, - os.path.join('mediapipe/graphs/', graph_path), + os.path.join('mediapipe/modules/', graph_path), ] if subprocess.call(bazel_command) != 0: sys.exit(-1) - output_name = graph_path.replace('_binary_graph', '.binarypb') - output_file = os.path.join('mediapipe/graphs', output_name) + output_name = graph_path + '.binarypb' + output_file = os.path.join('mediapipe/modules', output_name) shutil.copyfile( - os.path.join('bazel-bin/mediapipe/graphs/', output_name), output_file) + os.path.join('bazel-bin/mediapipe/modules/', output_name), output_file) class BazelExtension(setuptools.Extension): @@ -320,7 +341,7 @@ def run(self): sys.stderr.write('removing generated files: %s\n' % py_file) os.remove(py_file) for binarypb_file in glob.glob( - 'mediapipe/graphs/**/*.binarypb', recursive=True): + 'mediapipe/modules/**/*.binarypb', recursive=True): sys.stderr.write('removing generated binary graphs: %s\n' % binarypb_file) os.remove(binarypb_file) # Restore the original init file from the backup. diff --git a/third_party/org_tensorflow_compatibility_fixes.diff b/third_party/org_tensorflow_compatibility_fixes.diff index 376384018e..2f965cf417 100644 --- a/third_party/org_tensorflow_compatibility_fixes.diff +++ b/third_party/org_tensorflow_compatibility_fixes.diff @@ -29,7 +29,7 @@ index ba50783765..5de5ea01f0 100644 +++ b/tensorflow/core/platform/test.h @@ -42,7 +42,6 @@ limitations under the License. #if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) - #include "testing/base/public/gmock.h" + #include "testing/base/public/gmock.h" // IWYU pragma: export #else -#include #include
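[Editorial sketch, not part of this patch.] A brief smoke test for the Pose API and the packaging changes above, assuming the wheel built by this setup.py is installed: the ModifyInitFiles step appends "from mediapipe.python import *" and "import mediapipe.python.solutions as solutions" to mediapipe/__init__.py, so the three new solution APIs should be reachable from the package root.

  import numpy as np
  import mediapipe as mp

  # A blank RGB frame: with no person in view, the returned pose_landmarks
  # field is expected to be empty (None).
  frame = np.zeros((480, 640, 3), dtype=np.uint8)
  pose = mp.solutions.pose.Pose(static_image_mode=True)
  print(pose.process(frame).pose_landmarks)
  pose.close()

  # The other two solutions are exported the same way.
  print(mp.solutions.face_mesh.FaceMesh, mp.solutions.hands.Hands)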