From 5cd8b1fbd0da892dc089b6a99c9fb9f5732cadb9 Mon Sep 17 00:00:00 2001
From: dizcza
Date: Tue, 22 Oct 2024 15:06:27 +0300
Subject: [PATCH 1/2] Add a webcam demo

---
 rt_gene_webcam_demo/README.md      |   5 +
 rt_gene_webcam_demo/webcam_demo.py | 166 +++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 rt_gene_webcam_demo/README.md
 create mode 100755 rt_gene_webcam_demo/webcam_demo.py

diff --git a/rt_gene_webcam_demo/README.md b/rt_gene_webcam_demo/README.md
new file mode 100644
index 0000000..4c2cc99
--- /dev/null
+++ b/rt_gene_webcam_demo/README.md
@@ -0,0 +1,5 @@
+# RT-GENE webcam demo
+
+See the [rt_gene_standalone](../rt_gene_standalone) README for installation instructions.
+
+The `webcam_demo.py` script draws eye gaze vectors on a live video stream from a webcam.
diff --git a/rt_gene_webcam_demo/webcam_demo.py b/rt_gene_webcam_demo/webcam_demo.py
new file mode 100755
index 0000000..02b133a
--- /dev/null
+++ b/rt_gene_webcam_demo/webcam_demo.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+# Licensed under Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode)
+
+from __future__ import print_function, division, absolute_import
+
+import argparse
+import os
+import sys
+
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+from rt_gene.extract_landmarks_method_base import LandmarkMethodBase
+from rt_gene.gaze_tools import get_phi_theta_from_euler, limit_yaw, crop_face_from_image
+from rt_gene.gaze_tools_standalone import euler_from_matrix
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+
+
+def load_camera_calibration(calibration_file):
+    import yaml
+    with open(calibration_file, 'r') as f:
+        cal = yaml.safe_load(f)
+
+    dist_coefficients = np.array(cal['distortion_coefficients']['data'], dtype='float32').reshape(1, 5)
+    camera_matrix = np.array(cal['camera_matrix']['data'], dtype='float32').reshape(3, 3)
+
+    return dist_coefficients, camera_matrix
+
+
+def estimate_gaze(color_img, dist_coefficients, camera_matrix):
+    faceboxes = landmark_estimator.get_face_bb(color_img)
+    if len(faceboxes) == 0:
+        tqdm.write('Could not find faces in the image')
+        return
+
+    subjects = landmark_estimator.get_subjects_from_faceboxes(color_img, faceboxes)
+
+    input_r_list = []
+    input_l_list = []
+    input_head_list = []
+    valid_subject_list = []
+
+    for idx, subject in enumerate(subjects):
+        le_c, re_c, le_bb, re_bb = subject.get_eye_image_from_landmarks(subject, landmark_estimator.eye_image_size)
+        subject.left_eye_color = le_c
+        subject.right_eye_color = re_c
+        subject.left_eye_bb = le_bb
+        subject.right_eye_bb = re_bb
+
+        if subject.left_eye_color is None or subject.right_eye_color is None:
+            tqdm.write('Failed to extract eye image patches')
+            continue
+
+        success, rotation_vector, _ = cv2.solvePnP(landmark_estimator.model_points,
+                                                   subject.landmarks.reshape(len(subject.landmarks), 1, 2),
+                                                   cameraMatrix=camera_matrix,
+                                                   distCoeffs=dist_coefficients, flags=cv2.SOLVEPNP_DLS)
+
+        if not success:
+            tqdm.write('Not able to extract head pose for subject {}'.format(idx))
+            continue
+
+        _rotation_matrix, _ = cv2.Rodrigues(rotation_vector)
+        _rotation_matrix = np.matmul(_rotation_matrix, np.array([[0, 1, 0], [0, 0, -1], [-1, 0, 0]]))
+        _m = np.zeros((4, 4))
+        _m[:3, :3] = _rotation_matrix
+        _m[3, 3] = 1
+        # Go from camera space to ROS space
+        _camera_to_ros = [[0.0, 0.0, 1.0, 0.0],
+                          [-1.0, 0.0, 0.0, 0.0],
+                          [0.0, -1.0, 0.0, 0.0],
+                          [0.0, 0.0, 0.0, 1.0]]
+        roll_pitch_yaw = list(euler_from_matrix(np.dot(_camera_to_ros, _m)))
+        roll_pitch_yaw = limit_yaw(roll_pitch_yaw)
+
+        phi_head, theta_head = get_phi_theta_from_euler(roll_pitch_yaw)
+
+        input_r_list.append(gaze_estimator.input_from_image(subject.right_eye_color))
+        input_l_list.append(gaze_estimator.input_from_image(subject.left_eye_color))
+        input_head_list.append([theta_head, phi_head])
+        valid_subject_list.append(idx)
+
+    if len(valid_subject_list) == 0:
+        return
+
+    gaze_est = gaze_estimator.estimate_gaze_twoeyes(inference_input_left_list=input_l_list,
+                                                    inference_input_right_list=input_r_list,
+                                                    inference_headpose_list=input_head_list)
+
+    for subject_id, gaze, headpose in zip(valid_subject_list, gaze_est, input_head_list):
+        subject = subjects[subject_id]
+
+        # Eye bounding boxes are relative to the facebox.
+        # Shift them by the facebox position
+        x0, y0 = subject.box[:2]
+        shift_face_vec = np.array([x0, y0, x0, y0])
+        left_bb = subject.left_eye_bb + shift_face_vec
+        right_bb = subject.right_eye_bb + shift_face_vec
+
+        # Extract the patches by reference and plot the eye gaze in place
+        l_gaze_img = crop_face_from_image(color_img, left_bb)
+        r_gaze_img = crop_face_from_image(color_img, right_bb)
+        gaze_estimator.visualize_eye_result(l_gaze_img, gaze, inplace=True)
+        gaze_estimator.visualize_eye_result(r_gaze_img, gaze, inplace=True)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Estimate gaze from webcam')
+    parser.add_argument('--calib-file', type=str, dest='calib_file', default=None, help='Camera calibration file')
+    parser.add_argument('--gaze_backend', choices=['tensorflow', 'pytorch'], default='pytorch')
+    parser.add_argument('--models', nargs='+', type=str, default=[os.path.abspath(
+        os.path.join(script_path, '../rt_gene/model_nets/gaze_model_pytorch_vgg16_prl_mpii_allsubjects1.model'))],
+                        help='List of gaze estimators')
+    parser.add_argument('--device-id-facedetection', dest="device_id_facedetection", type=str, default='cuda:0',
+                        help='Pytorch device id. Set to "cpu:0" to disable cuda')
+    parser.add_argument('--webcam', dest="webcam", type=int, default=0,
+                        help='Webcam device number. Default is 0')
+    args = parser.parse_args()
+
+    tqdm.write('Loading networks')
+    landmark_estimator = LandmarkMethodBase(device_id_facedetection=args.device_id_facedetection,
+                                            checkpoint_path_face=os.path.abspath(os.path.join(script_path, "../rt_gene/model_nets/SFD/s3fd_facedetector.pth")),
+                                            checkpoint_path_landmark=os.path.abspath(
+                                                os.path.join(script_path, "../rt_gene/model_nets/phase1_wpdc_vdc.pth.tar")),
+                                            model_points_file=os.path.abspath(os.path.join(script_path, "../rt_gene/model_nets/face_model_68.txt")))
+
+    if args.gaze_backend == "tensorflow":
+        from rt_gene.estimate_gaze_tensorflow import GazeEstimator
+
+        gaze_estimator = GazeEstimator("/gpu:0", args.models)
+    elif args.gaze_backend == "pytorch":
+        from rt_gene.estimate_gaze_pytorch import GazeEstimator
+
+        gaze_estimator = GazeEstimator("cuda:0", args.models)
+    else:
+        raise ValueError("Incorrect gaze_backend, choices are: tensorflow or pytorch")
+
+    cap = cv2.VideoCapture(args.webcam)
+
+    while True:
+        # Grab a frame from the webcam and mirror it horizontally
+        ret, image = cap.read()
+        if not ret:
+            break
+        image = cv2.flip(image, 1)
+
+        if args.calib_file is not None:
+            _dist_coefficients, _camera_matrix = load_camera_calibration(args.calib_file)
+        else:
+            im_width, im_height = image.shape[1], image.shape[0]
+            tqdm.write('WARNING!!! You should provide the camera calibration file, otherwise you might get bad results. Using a crude approximation!')
+            _dist_coefficients, _camera_matrix = np.zeros((1, 5)), np.array(
+                [[im_height, 0.0, im_width / 2.0], [0.0, im_height, im_height / 2.0], [0.0, 0.0, 1.0]])
+
+        estimate_gaze(image, _dist_coefficients, _camera_matrix)
+
+        cv2.imshow("Output", image)
+        k = cv2.waitKey(1) & 0xFF
+        if k == ord('q'):
+            break
+
+    cv2.destroyAllWindows()
+    cap.release()

From e7a4b43d6b9d4f532c99fa8f5db861a80fc84916 Mon Sep 17 00:00:00 2001
From: dizcza
Date: Tue, 22 Oct 2024 15:23:45 +0300
Subject: [PATCH 2/2] webcam demo: show eye gaze in a separate window

---
 rt_gene_webcam_demo/webcam_demo.py | 73 ++++++++++++++++--------------
 1 file changed, 40 insertions(+), 33 deletions(-)

diff --git a/rt_gene_webcam_demo/webcam_demo.py b/rt_gene_webcam_demo/webcam_demo.py
index 02b133a..63c05e7 100755
--- a/rt_gene_webcam_demo/webcam_demo.py
+++ b/rt_gene_webcam_demo/webcam_demo.py
@@ -6,14 +6,13 @@
 
 import argparse
 import os
-import sys
 
 import cv2
 import numpy as np
 from tqdm import tqdm
 
 from rt_gene.extract_landmarks_method_base import LandmarkMethodBase
-from rt_gene.gaze_tools import get_phi_theta_from_euler, limit_yaw, crop_face_from_image
+from rt_gene.gaze_tools import get_phi_theta_from_euler, limit_yaw
 from rt_gene.gaze_tools_standalone import euler_from_matrix
 
 script_path = os.path.dirname(os.path.realpath(__file__))
@@ -30,6 +29,13 @@ def load_camera_calibration(calibration_file):
     return dist_coefficients, camera_matrix
 
 
+def extract_eye_image_patches(subjects):
+    for subject in subjects:
+        le_c, re_c, _, _ = subject.get_eye_image_from_landmarks(subject, landmark_estimator.eye_image_size)
+        subject.left_eye_color = le_c
+        subject.right_eye_color = re_c
+
+
 def estimate_gaze(color_img, dist_coefficients, camera_matrix):
     faceboxes = landmark_estimator.get_face_bb(color_img)
     if len(faceboxes) == 0:
@@ -37,6 +43,7 @@ def estimate_gaze(color_img, dist_coefficients, camera_matrix):
         return
 
     subjects = landmark_estimator.get_subjects_from_faceboxes(color_img, faceboxes)
+    extract_eye_image_patches(subjects)
 
     input_r_list = []
     input_l_list = []
@@ -44,12 +51,6 @@ def estimate_gaze(color_img, dist_coefficients, camera_matrix):
     valid_subject_list = []
 
     for idx, subject in enumerate(subjects):
-        le_c, re_c, le_bb, re_bb = subject.get_eye_image_from_landmarks(subject, landmark_estimator.eye_image_size)
-        subject.left_eye_color = le_c
-        subject.right_eye_color = re_c
-        subject.left_eye_bb = le_bb
-        subject.right_eye_bb = re_bb
-
         if subject.left_eye_color is None or subject.right_eye_color is None:
             tqdm.write('Failed to extract eye image patches')
             continue
@@ -78,6 +79,12 @@ def estimate_gaze(color_img, dist_coefficients, camera_matrix):
 
         phi_head, theta_head = get_phi_theta_from_euler(roll_pitch_yaw)
 
+        face_image_resized = cv2.resize(subject.face_color, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
+        head_pose_image = landmark_estimator.visualize_headpose_result(face_image_resized, (phi_head, theta_head))
+
+        if args.vis_headpose:
+            cv2.imshow("Head pose", head_pose_image)
+
         input_r_list.append(gaze_estimator.input_from_image(subject.right_eye_color))
         input_l_list.append(gaze_estimator.input_from_image(subject.left_eye_color))
         input_head_list.append([theta_head, phi_head])
@@ -90,38 +97,38 @@ def estimate_gaze(color_img, dist_coefficients, camera_matrix):
                                                     inference_input_right_list=input_r_list,
                                                     inference_headpose_list=input_head_list)
 
-    for subject_id, gaze, headpose in zip(valid_subject_list, gaze_est, input_head_list):
+    for subject_id, gaze, headpose in zip(valid_subject_list, gaze_est.tolist(), input_head_list):
         subject = subjects[subject_id]
+        # Build visualizations
+        r_gaze_img = gaze_estimator.visualize_eye_result(subject.right_eye_color, gaze)
+        l_gaze_img = gaze_estimator.visualize_eye_result(subject.left_eye_color, gaze)
+        s_gaze_img = np.concatenate((r_gaze_img, l_gaze_img), axis=1)
+        s_gaze_img = cv2.resize(s_gaze_img, (0, 0), fx=2.0, fy=2.0)
 
-        # Eye bounding boxes are relative to the facebox.
-        # Shift them by the facebox position
-        x0, y0 = subject.box[:2]
-        shift_face_vec = np.array([x0, y0, x0, y0])
-        left_bb = subject.left_eye_bb + shift_face_vec
-        right_bb = subject.right_eye_bb + shift_face_vec
-
-        # Extract the patches by reference and plot the eye gaze in place
-        l_gaze_img = crop_face_from_image(color_img, left_bb)
-        r_gaze_img = crop_face_from_image(color_img, right_bb)
-        gaze_estimator.visualize_eye_result(l_gaze_img, gaze, inplace=True)
-        gaze_estimator.visualize_eye_result(r_gaze_img, gaze, inplace=True)
+        cv2.imshow("Eye gaze", s_gaze_img)
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Estimate gaze from webcam')
-    parser.add_argument('--calib-file', type=str, dest='calib_file', default=None, help='Camera calibration file')
-    parser.add_argument('--gaze_backend', choices=['tensorflow', 'pytorch'], default='pytorch')
-    parser.add_argument('--models', nargs='+', type=str, default=[os.path.abspath(
-        os.path.join(script_path, '../rt_gene/model_nets/gaze_model_pytorch_vgg16_prl_mpii_allsubjects1.model'))],
-                        help='List of gaze estimators')
-    parser.add_argument('--device-id-facedetection', dest="device_id_facedetection", type=str, default='cuda:0',
-                        help='Pytorch device id. Set to "cpu:0" to disable cuda')
+    parser = argparse.ArgumentParser(description='Estimate gaze from a webcam')
     parser.add_argument('--webcam', dest="webcam", type=int, default=0,
                         help='Webcam device number. Default is 0')
+    parser.add_argument('--calib-file', type=str, dest='calib_file', default=None, help='Camera calibration file')
+    parser.add_argument('--vis-headpose', dest='vis_headpose', action='store_true', help='Display the head pose images')
+    parser.add_argument('--no-vis-headpose', dest='vis_headpose', action='store_false', help='Do not display the head pose images')
+    parser.add_argument('--gaze_backend', choices=['tensorflow', 'pytorch'], default='tensorflow')
+    parser.add_argument('--models', nargs='+', type=str, default=[os.path.abspath(os.path.join(script_path, '../rt_gene/model_nets/Model_allsubjects1.h5'))],
+                        help='List of gaze estimators')
+    parser.add_argument('--device-id-pytorch', dest="device_id_pytorch", type=str, default='cpu:0', help='Pytorch device id. Set to "cpu:0" to disable cuda')
+    parser.add_argument('--device-id-tensorflow', dest="device_id_tensorflow", type=str, default='/cpu:0', help='Tensorflow device id. Set to "/cpu:0" to disable cuda')
Set to "/cpu:0" to disable cuda') + + parser.set_defaults(vis_gaze=True) + parser.set_defaults(save_gaze=False) + parser.set_defaults(vis_headpose=False) + args = parser.parse_args() tqdm.write('Loading networks') - landmark_estimator = LandmarkMethodBase(device_id_facedetection=args.device_id_facedetection, + landmark_estimator = LandmarkMethodBase(device_id_facedetection=args.device_id_pytorch, checkpoint_path_face=os.path.abspath(os.path.join(script_path, "../rt_gene/model_nets/SFD/s3fd_facedetector.pth")), checkpoint_path_landmark=os.path.abspath( os.path.join(script_path, "../rt_gene/model_nets/phase1_wpdc_vdc.pth.tar")), @@ -130,11 +137,11 @@ def estimate_gaze(color_img, dist_coefficients, camera_matrix): if args.gaze_backend == "tensorflow": from rt_gene.estimate_gaze_tensorflow import GazeEstimator - gaze_estimator = GazeEstimator("/gpu:0", args.models) + gaze_estimator = GazeEstimator(args.device_id_tensorflow, args.models) elif args.gaze_backend == "pytorch": from rt_gene.estimate_gaze_pytorch import GazeEstimator - gaze_estimator = GazeEstimator("cuda:0", args.models) + gaze_estimator = GazeEstimator(args.device_id_pytorch, args.models) else: raise ValueError("Incorrect gaze_base backend, choices are: tensorflow or pytorch") @@ -155,7 +162,7 @@ def estimate_gaze(color_img, dist_coefficients, camera_matrix): estimate_gaze(image, _dist_coefficients, _camera_matrix) - cv2.imshow("Output", image) + cv2.imshow("Frame", image) k = cv2.waitKey(1) & 0xFF if k == ord('q'): break