Paper in WACV 2019 on “Eyemotion: Classifying Facial Expressions in VR Using Eye-Tracking Cameras”
Citation
S. Hickson, N. Dufour, A. Sud, V. Kwatra, and I. Essa, "Eyemotion: Classifying Facial Expressions in VR Using Eye-Tracking Cameras," in IEEE Winter Conference on Applications of Computer Vision (WACV), 2019, pp. 1626-1635. doi: 10.1109/WACV.2019.00178
Abstract
One of the main challenges of social interaction in virtual reality settings is that head-mounted displays occlude a large portion of the face, blocking facial expressions and thereby restricting social engagement cues among users. We present an algorithm to automatically infer expressions by analyzing only a partially occluded face while the user is engaged in a virtual reality experience. Specifically, we show that images of the user's eyes captured from an IR gaze-tracking camera within a VR headset are sufficient to infer a subset of facial expressions without the use of any fixed external camera. Using these inferences, we can generate dynamic avatars in real-time which function as an expressive surrogate for the user. We propose a novel data collection pipeline as well as a novel approach for increasing CNN accuracy via personalization. Our results show a mean accuracy of 74% (F1 of 0.73) among 5 'emotive' expressions and a mean accuracy of 70% (F1 of 0.68) among 10 distinct facial action units, outperforming human raters.
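To make the abstract's pipeline concrete, here is a minimal, purely illustrative sketch; it is not the authors' architecture or code. It assumes 1-channel IR eye-camera crops resized to 64x64, 5 'emotive' expression classes, and reads "personalization" as subtracting a per-user mean neutral-expression image before classification. All names, shapes, and layers below are assumptions for illustration only.

```python
# Illustrative sketch only -- NOT the Eyemotion paper's architecture or training code.
# Assumes 1-channel IR eye crops resized to 64x64 and 5 'emotive' expression classes.
import torch
import torch.nn as nn


class EyeExpressionCNN(nn.Module):
    """Small CNN over a personalized (mean-neutral-subtracted) eye image."""

    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),   # 64 -> 32
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 32 -> 16
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 16 -> 8
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 8 * 8, 128), nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.classifier(self.features(x))


def personalize(eye_image: torch.Tensor, user_neutral_mean: torch.Tensor) -> torch.Tensor:
    """One hypothetical personalization step: subtract the user's mean neutral-expression image."""
    return eye_image - user_neutral_mean


if __name__ == "__main__":
    # Toy usage with random tensors standing in for IR eye-camera frames.
    model = EyeExpressionCNN(num_classes=5)
    neutral_mean = torch.rand(1, 1, 64, 64)   # per-user mean of neutral-expression frames
    frame = torch.rand(1, 1, 64, 64)          # current IR eye crop
    logits = model(personalize(frame, neutral_mean))
    print("predicted class:", logits.argmax(dim=1).item())
```

In this reading, the personalization step normalizes away per-user appearance (eye shape, headset fit, illumination) so the classifier sees expression-driven deviations from each user's own neutral baseline; the paper should be consulted for the actual method and architecture.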
Links
- https://ieeexplore.ieee.org/document/8658392
- https://ai.google/research/pubs/pub46291
- doi:10.1109/WACV.2019.00178
BibTeX
@inproceedings{2019-Hickson-ECFEUEC,
  title     = {Eyemotion: Classifying Facial Expressions in VR Using Eye-Tracking Cameras},
  author    = {S. Hickson and N. Dufour and A. Sud and V. Kwatra and I. Essa},
  url       = {https://ieeexplore.ieee.org/document/8658392 https://ai.google/research/pubs/pub46291},
  doi       = {10.1109/WACV.2019.00178},
  issn      = {1550-5790},
  year      = {2019},
  date      = {2019-01-01},
  urldate   = {2019-01-01},
  booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
  pages     = {1626-1635},
  abstract  = {One of the main challenges of social interaction in virtual reality settings is that head-mounted displays occlude a large portion of the face, blocking facial expressions and thereby restricting social engagement cues among users. We present an algorithm to automatically infer expressions by analyzing only a partially occluded face while the user is engaged in a virtual reality experience. Specifically, we show that images of the user's eyes captured from an IR gaze-tracking camera within a VR headset are sufficient to infer a subset of facial expressions without the use of any fixed external camera. Using these inferences, we can generate dynamic avatars in real-time which function as an expressive surrogate for the user. We propose a novel data collection pipeline as well as a novel approach for increasing CNN accuracy via personalization. Our results show a mean accuracy of 74% (F1 of 0.73) among 5 'emotive' expressions and a mean accuracy of 70% (F1 of 0.68) among 10 distinct facial action units, outperforming human raters.},
  keywords  = {audio-video fusion, face & gesture, face processing, multimodal interfaces, WACV},
  pubstate  = {published},
  tppubtype = {inproceedings}
}