Easy Hand Tracking with MediaPipe and OSC Integration for Real-time Toys


MediaPipe is an open-source machine learning framework presented by Bazarevsky et al. at CVPR 2019.

https://github.com/google/mediapipe

The hand tracking feature used there combines a single-shot palm detection algorithm and a hand landmark model (Google AI Blog: On-Device, Real-Time Hand Tracking with MediaPipe).

The shape of the hand can be obtained as the coordinates of the following landmarks. Inference runs every frame, and the landmark values can be obtained as normalized image coordinates.

https://google.github.io/mediapipe/solutions/hands

The returned landmark object allows you to extract the float coordinates as follows:

1for hand_idx, landmarks in enumerate(multi_hand_landmarks):
2    for point_idx, points in enumerate(landmarks.landmark):
3        print(f"Hand: {hand_idx}, {HAND_LANDMARK_NAMES[point_idx]},"
4                      + f" x:{points.x} y:{points.y} z:{points.z}")

At this time, HAND_LANDMARK_NAMES is in the following order:

1HAND_LANDMARK_NAMES = [
2    "wrist",
3    "thumb_1",
4    "thumb_2",
5    "thumb_3",
6    "thumb_4",
7    "index_1",
8    "index_2",
9    "index_3",
10    "index_4",
11    "middle_1",
12    "middle_2",
13    "middle_3",
14    "middle_4",
15    "ring_1",
16    "ring_2",
17    "ring_3",
18    "ring_4",
19    "pinky_1",
20    "pinky_2",
21    "pinky_3",
22    "pinky_4"
23]

In this article, we will use MediaPipe's real-time hand tracking to prototype data transmission via Open Sound Control (OSC), which can drive interactive artworks or gesture-recognition tools such as Wekinator.

Script

# Atsuya Kobayashi 2020-12-22
# Reference: https://google.github.io/mediapipe/solutions/hands
# LICENCE: MIT

from itertools import chain

# NOTE: `from cv2 import cv2` was a workaround for old opencv-python type
# stubs and raises ImportError on current releases; the plain import works
# on all versions.
import cv2
import mediapipe as mp
from pythonosc import udp_client

# OSC destination the landmark data is sent to.
IP = "127.0.0.1"
PORT = 7474
# Index passed to cv2.VideoCapture; 0 is usually the built-in webcam.
VIDEO_DEVICE_ID = 0
RELATIVE_AXIS_MODE = True

# Names of the 21 MediaPipe hand landmarks, in model output order:
# the wrist first, then four joints per finger from base to tip.
HAND_LANDMARK_NAMES = ["wrist"] + [
    f"{finger}_{joint}"
    for finger in ("thumb", "index", "middle", "ring", "pinky")
    for joint in range(1, 5)
]


def extract_detected_hands_points(multi_hand_landmarks,
                                  send_osc_client=None):
    """Print the landmark coordinates of each detected hand and,
    optionally, send them over OSC.

    Args:
        multi_hand_landmarks: ``results.multi_hand_landmarks`` from
            MediaPipe Hands (one landmark list per detected hand), or
            ``None`` when no hand was detected.
        send_osc_client: optional pythonosc UDP client; when given, the
            x/y coordinates of all landmarks of each hand are sent as
            one flat list to a single OSC address.
    """
    if multi_hand_landmarks is None:
        return

    for hand_idx, landmarks in enumerate(multi_hand_landmarks):
        for point_idx, points in enumerate(landmarks.landmark):

            # If you want to check data on the console.
            print(f"Hand: {hand_idx}, {HAND_LANDMARK_NAMES[point_idx]},"
                  + f" x:{points.x} y:{points.y} z:{points.z}")
            # If you want to send data to addresses corresponding to
            # landmark names on detected hands, use the lines below:
            # if send_osc_client is not None:
            #     send_osc_client.send_message(f"/{HAND_LANDMARK_NAMES[point_idx]}",
            #                                  [points.x, points.y])

        # If you want to send data to a single input address, use the lines below.
        if send_osc_client is not None:
            send_osc_client.send_message(
                "/YOUR_OSC_ADDRESS",
                list(chain.from_iterable([p.x, p.y] for p in landmarks.landmark)))

if __name__ == "__main__":

    mp_drawing = mp.solutions.drawing_utils
    mp_hands = mp.solutions.hands

    hands = mp_hands.Hands(
        min_detection_confidence=0.5, min_tracking_confidence=0.5)

    cap = cv2.VideoCapture(VIDEO_DEVICE_ID)

    osc_client = udp_client.SimpleUDPClient(IP, PORT)

    try:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                # If loading a video, use 'break' instead of 'continue'.
                continue

            # Mirror the frame and convert BGR -> RGB (MediaPipe expects RGB).
            image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
            # Mark the frame read-only so MediaPipe can avoid copying it.
            image.flags.writeable = False
            results = hands.process(image)
            extract_detected_hands_points(results.multi_hand_landmarks,
                                          send_osc_client=osc_client)
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(
                        image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            cv2.imshow('Detected Hands', image)

            # Quit on the ESC key.
            if cv2.waitKey(5) & 0xFF == 27:
                break
    finally:
        # Release resources even if the loop raises, and close the
        # preview window (the original leaked it on exit).
        hands.close()
        cap.release()
        cv2.destroyAllWindows()