@inproceedings{2024-Zhang-FFCSPTM, title = {FineStyle: Fine-grained Controllable Style Personalization for Text-to-image Models}, author = {Gong Zhang and Kihyuk Sohn and Meera Hahn and Humphrey Shi and Irfan Essa}, url = {https://neurips.cc/virtual/2024/poster/96863 https://openreview.net/forum?id=1SmXUGzrH8}, year = {2024}, date = {2024-12-11}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, abstract = {Few-shot fine-tuning of text-to-image (T2I) generation models enables people to create unique images in their own style using natural language without requiring extensive prompt engineering. However, fine-tuning with only a handful of image-text pairs, as few as one, prevents fine-grained control of style attributes at generation. In this paper, we present FineStyle, a few-shot fine-tuning method that allows enhanced controllability for style-personalized text-to-image generation. To overcome the lack of training data for fine-tuning, we propose a novel concept-oriented data scaling that amplifies the number of image-text pairs, each of which focuses on different concepts (e.g., objects) in the style reference image. We also identify the benefit of parameter-efficient adapter tuning of key and value kernels of cross-attention layers. Extensive experiments show the effectiveness of FineStyle at following fine-grained text prompts and delivering visual quality faithful to the specified style, measured by CLIP scores and human raters.}, keywords = {computer vision, generative AI, generative media, machine learning, NeurIPS}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2024-Lee-PPMRLFTG, title = {Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation}, author = {Seung Hyun Lee and Yinxiao Li and Junjie Ke and Innfarn Yoo and Han Zhang and Jiahui Yu and Qifei Wang and Fei Deng and Glenn Entis and Junfeng He and Gang Li and Sangpil Kim and Irfan Essa and Feng Yang}, url = {https://arxiv.org/abs/2401.05675 https://arxiv.org/pdf/2401.05675 https://dl.acm.org/doi/10.1007/978-3-031-72920-1_26}, doi = {10.48550/arXiv.2401.05675}, year = {2024}, date = {2024-07-25}, urldate = {2024-07-25}, booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}, abstract = {Recent works have demonstrated that using reinforcement learning (RL) with multiple quality rewards can improve the quality of generated images in text-to-image (T2I) generation. However, manually adjusting reward weights poses challenges and may cause over-optimization in certain metrics. To solve this, we propose Parrot, which addresses the issue through multi-objective optimization and introduces an effective multi-reward optimization strategy to approximate Pareto optimality. Utilizing batch-wise Pareto-optimal selection, Parrot automatically identifies the optimal trade-off among different rewards. We use the novel multi-reward optimization algorithm to jointly optimize the T2I model and a prompt expansion network, resulting in significant improvements in image quality and also allowing control of the trade-off among different rewards using a reward-related prompt during inference. Furthermore, we introduce original prompt-centered guidance at inference time, ensuring fidelity to user input after prompt expansion.
Extensive experiments and a user study validate the superiority of Parrot over several baselines across various quality criteria, including aesthetics, human preference, text-image alignment, and image sentiment. }, keywords = {arXiv, computer vision, ECCV, generative AI, google, reinforcement learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2024-Gupta-PVGWDM, title = {Photorealistic Video Generation with Diffusion Models}, author = {Agrim Gupta and Lijun Yu and Kihyuk Sohn and Xiuye Gu and Meera Hahn and Li Fei-Fei and Irfan Essa and Lu Jiang and José Lezama }, url = {https://walt-video-diffusion.github.io/ https://arxiv.org/abs/2312.06662 https://arxiv.org/pdf/2312.06662 }, doi = {10.48550/arXiv.2312.06662}, year = {2024}, date = {2024-07-25}, urldate = {2024-07-25}, booktitle = {European Conference on Computer Vision (ECCV)}, abstract = {We present W.A.L.T, a transformer-based approach for photorealistic video generation via diffusion modeling. Our approach has two key design decisions. First, we use a causal encoder to jointly compress images and videos within a unified latent space, enabling training and generation across modalities. Second, for memory and training efficiency, we use a window attention architecture tailored for joint spatial and spatiotemporal generative modeling. Taken together these design decisions enable us to achieve state-of-the-art performance on established video (UCF-101 and Kinetics-600) and image (ImageNet) generation benchmarks without using classifier free guidance. Finally, we also train a cascade of three models for the task of text-to-video generation consisting of a base latent video diffusion model, and two video super-resolution diffusion models to generate videos of 512×896 resolution at 8 frames per second.}, keywords = {arXiv, computational video, computer vision, generative AI, google}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2024-Kondratyuk-VLLMZVG, title = {VideoPoet: A large language model for zero-shot video generation}, author = {Dan Kondratyuk and Lijun Yu and Xiuye Gu and José Lezama and Jonathan Huang and Grant Schindler and Rachel Hornung and Vighnesh Birodkar and Jimmy Yan and Ming-Chang Chiu and Krishna Somandepalli and Hassan Akbari and Yair Alon and Yong Cheng and Josh Dillon and Agrim Gupta and Meera Hahn and Anja Hauth and David Hendon and Alonso Martinez and David Minnen and Mikhail Sirotenko and Kihyuk Sohn and Xuan Yang and Hartwig Adam and Ming-Hsuan Yang and Irfan Essa and Huisheng Wang and David A. Ross and Bryan Seybold and Lu Jiang }, url = {https://arxiv.org/pdf/2312.14125 https://arxiv.org/abs/2312.14125 https://sites.research.google/videopoet/}, doi = {10.48550/arXiv.2312.14125}, year = {2024}, date = {2024-07-23}, urldate = {2024-07-23}, booktitle = {Proceedings of International Conference on Machine Learning (ICML)}, abstract = {We present VideoPoet, a language model capable of synthesizing high-quality video, with matching audio, from a large variety of conditioning signals. VideoPoet employs a decoder-only transformer architecture that processes multimodal inputs -- including images, videos, text, and audio. The training protocol follows that of Large Language Models (LLMs), consisting of two stages: pretraining and task-specific adaptation. During pretraining, VideoPoet incorporates a mixture of multimodal generative objectives within an autoregressive Transformer framework. 
The pretrained LLM serves as a foundation that can be adapted for a range of video generation tasks. We present empirical results demonstrating the model's state-of-the-art capabilities in zero-shot video generation, specifically highlighting VideoPoet's ability to generate high-fidelity motions. Project page: http://sites.research.google/videopoet/ }, keywords = {arXiv, best paper award, computational video, computer vision, generative AI, google, ICML}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2024-Xu-PDTTTDM, title = {Prompt-Free Diffusion: Taking "Text" out of Text-to-Image Diffusion Models}, author = {Xingqian Xu and Jiayi Guo and Zhangyang Wang and Gao Huang and Irfan Essa and Humphrey Shi}, url = {https://openaccess.thecvf.com/content/CVPR2024/papers/Xu_Prompt-Free_Diffusion_Taking_Text_out_of_Text-to-Image_Diffusion_Models_CVPR_2024_paper.pdf https://openaccess.thecvf.com/content/CVPR2024/html/Xu_Prompt-Free_Diffusion_Taking_Text_out_of_Text-to-Image_Diffusion_Models_CVPR_2024_paper.html https://arxiv.org/abs/2305.16223}, doi = {10.48550/arXiv.2305.16223}, year = {2024}, date = {2024-06-18}, urldate = {2024-06-18}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {8682--8692}, abstract = {Text-to-image (T2I) research has grown explosively in the past year, owing to large-scale pre-trained diffusion models and many emerging personalization and editing approaches. Yet one pain point persists: text prompt engineering, and searching for high-quality text prompts for customized results, is more art than science. Moreover, as commonly argued, "an image is worth a thousand words"; the attempt to describe a desired image with text often ends up being ambiguous and cannot comprehensively cover delicate visual details, hence necessitating additional controls from the visual domain. In this paper, we take a bold step forward: taking "Text" out of a pretrained T2I diffusion model, to reduce the burdensome prompt engineering efforts for users. Our proposed framework, Prompt-Free Diffusion, relies on only visual inputs to generate new images: it takes a reference image as "context", an optional image structural conditioning, and an initial noise, with absolutely no text prompt. The core architecture behind the scenes is the Semantic Context Encoder (SeeCoder), substituting the commonly used CLIP-based or LLM-based text encoder. The reusability of SeeCoder also makes it a convenient drop-in component: one can also pre-train a SeeCoder in one T2I model and reuse it for another. Through extensive experiments, Prompt-Free Diffusion is found to (i) outperform prior exemplar-based image synthesis approaches; (ii) perform on par with state-of-the-art T2I models using prompts that follow best practices; and (iii) be naturally extensible to other downstream applications such as anime figure generation and virtual try-on, with promising quality. Our code and models will be open-sourced.}, keywords = {arXiv, computer vision, CVPR, generative AI}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2024-Haresamudram-WMNFMDSHAR, title = {A Washing Machine is All You Need?
On the Feasibility of Machine Data for Self-Supervised Human Activity Recognition}, author = {Harish Haresamudram and Irfan Essa and Thomas Plötz}, url = {https://ieeexplore.ieee.org/abstract/document/10651688}, doi = {10.1109/ABC61795.2024.10651688}, year = {2024}, date = {2024-05-24}, booktitle = {International Conference on Activity and Behavior Computing (ABC) 2024}, abstract = {Learning representations via self-supervision has emerged as a powerful framework for deriving features for automatically recognizing activities using wearables. The current de facto protocol involves performing pre-training on (large-scale) data recorded from human participants. This requires effort, as recruiting participants and subsequently collecting data is both expensive and time-consuming. In this paper, we investigate an alternate source of data for its suitability to lead to useful representations, one that requires substantially lower effort for data collection. Specifically, we examine whether data collected by affixing sensors on running machinery, i.e., recording non-human movements/vibrations, can also be utilized for self-supervised human activity recognition. We perform an extensive evaluation of utilizing data collected on a washing machine as the source and observe that state-of-the-art methods perform surprisingly well relative to when utilizing large-scale human movement data, obtaining within 5-6% F1-score on some target datasets, and exceeding it on others. In scenarios with limited access to annotations, models trained on the washing-machine data perform comparably to or better than end-to-end training, thereby indicating their feasibility and potential for recognizing activities. These results are significant and promising because they have the potential to substantially lower the effort necessary for deriving effective wearables-based human activity recognition systems.}, keywords = {activity recognition, behavioral imaging, wearable computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2024-Yu-LMBDVG, title = {Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation}, author = {Lijun Yu and José Lezama and Nitesh B. Gundavarapu and Luca Versari and Kihyuk Sohn and David Minnen and Yong Cheng and Vighnesh Birodkar and Agrim Gupta and Xiuye Gu and Alexander G. Hauptmann and Boqing Gong and Ming-Hsuan Yang and Irfan Essa and David A. Ross and Lu Jiang}, url = {https://arxiv.org/abs/2310.05737 https://arxiv.org/pdf/2310.05737}, doi = {10.48550/arXiv.2310.05737}, year = {2024}, date = {2024-05-14}, urldate = {2024-05-14}, booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)}, abstract = {While Large Language Models (LLMs) are the dominant models for generative tasks in language, they do not perform as well as diffusion models on image and video generation. To effectively use LLMs for visual generation, one crucial component is the visual tokenizer that maps pixel-space inputs to discrete tokens appropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a video tokenizer designed to generate concise and expressive tokens for both videos and images using a common token vocabulary. Equipped with this new tokenizer, we show that LLMs outperform diffusion models on standard image and video generation benchmarks, including ImageNet and Kinetics.
In addition, we demonstrate that our tokenizer surpasses the previously top-performing video tokenizer on two more tasks: (1) video compression comparable to the next-generation video codec (VVC) according to human evaluations, and (2) learning effective representations for action recognition tasks.}, keywords = {AI, arXiv, computer vision, generative AI, google, ICLR}, pubstate = {published}, tppubtype = {inproceedings} } @article{2023-Haresamudram-TLDRSWHAR, title = {Towards Learning Discrete Representations via Self-Supervision for Wearables-Based Human Activity Recognition}, author = {Harish Haresamudram and Irfan Essa and Thomas Ploetz}, url = {https://arxiv.org/abs/2306.01108 https://www.mdpi.com/1424-8220/24/4/1238}, doi = {10.48550/arXiv.2306.01108}, year = {2024}, date = {2024-02-24}, urldate = {2023-06-01}, journal = {Sensors}, volume = {24}, number = {4}, abstract = {Human activity recognition (HAR) in wearable computing is typically based on direct processing of sensor data. Sensor readings are translated into representations, either derived through dedicated preprocessing or integrated into end-to-end learning. Independent of their origin, for the vast majority of contemporary HAR, those representations are typically continuous in nature. That has not always been the case. In the early days of HAR, discretization approaches were explored, primarily motivated by the desire to minimize computational requirements, but also with a view on applications beyond mere recognition, such as activity discovery, fingerprinting, or large-scale search. Those traditional discretization approaches, however, suffer from substantial loss in precision and resolution in the resulting representations, with detrimental effects on downstream tasks. Times have changed, and in this paper we propose a return to discretized representations. We adopt and apply recent advancements in Vector Quantization (VQ) to wearables applications, which enables us to directly learn a mapping between short spans of sensor data and a codebook of vectors, resulting in recognition performance that is generally on par with their contemporary, continuous counterparts, sometimes surpassing them. Therefore, this work presents a proof-of-concept demonstrating how effective discrete representations can be derived, enabling applications beyond mere activity classification and also opening up the field to advanced tools for the analysis of symbolic sequences, as they are known, for example, from domains such as natural language processing.
Based on an extensive experimental evaluation on a suite of wearables-based benchmark HAR tasks, we demonstrate the potential of our learned discretization scheme and discuss how discretized sensor data analysis can lead to substantial changes in HAR.}, howpublished = {arXiv:2306.01108}, keywords = {activity recognition, arXiv, wearable computing}, pubstate = {published}, tppubtype = {article} } @inproceedings{2023-Sohn-STGS, title = {StyleDrop: Text-to-Image Generation in Any Style}, author = {Kihyuk Sohn and Nataniel Ruiz and Kimin Lee and Daniel Castro Chin and Irina Blok and Huiwen Chang and Jarred Barber and Lu Jiang and Glenn Entis and Yuanzhen Li and Yuan Hao and Irfan Essa and Michael Rubinstein and Dilip Krishnan}, url = {https://arxiv.org/abs/2306.00983 https://openreview.net/forum?id=KoaFh16uOc https://proceedings.neurips.cc/paper_files/paper/2023/hash/d33b177b69425e7685b0b1c05bd2a5e4-Abstract-Conference.html}, doi = {10.48550/arXiv.2306.00983}, year = {2023}, date = {2023-12-11}, urldate = {2023-12-11}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, abstract = {Pre-trained large text-to-image models synthesize impressive images with an appropriate use of text prompts. However, ambiguities inherent in natural language and out-of-distribution effects make it hard to synthesize image styles that leverage a specific design pattern, texture, or material. In this paper, we introduce StyleDrop, a method that enables the synthesis of images that faithfully follow a specific style using a text-to-image model. The proposed method is extremely versatile and captures nuances and details of a user-provided style, such as color schemes, shading, design patterns, and local and global effects. It efficiently learns a new style by fine-tuning very few trainable parameters (less than 1% of total model parameters) and improving the quality via iterative training with either human or automated feedback. Better yet, StyleDrop is able to deliver impressive results even when the user supplies only a single image that specifies the desired style. An extensive study shows that, for the task of style tuning text-to-image models, StyleDrop implemented on Muse convincingly outperforms other methods, including DreamBooth and textual inversion on Imagen or Stable Diffusion. More results are available at our project website.}, howpublished = {arXiv:2306.00983}, keywords = {arXiv, computer vision, generative AI, google, NeurIPS}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2023-Yu-SSPAMGWFL, title = {SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs}, author = {Lijun Yu and Yong Cheng and Zhiruo Wang and Vivek Kumar and Wolfgang Macherey and Yanping Huang and David A. Ross and Irfan Essa and Yonatan Bisk and Ming-Hsuan Yang and Kevin Murphy and Alexander G. Hauptmann and Lu Jiang}, url = {https://arxiv.org/abs/2306.17842 https://openreview.net/forum?id=CXPUg86A1D https://proceedings.neurips.cc/paper_files/paper/2023/hash/a526cc8f6ffb74bedb6ff313e3fdb450-Abstract-Conference.html}, doi = {10.48550/arXiv.2306.17842}, year = {2023}, date = {2023-12-11}, urldate = {2023-12-11}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, abstract = {In this work, we introduce the Semantic Pyramid AutoEncoder (SPAE) for enabling frozen LLMs to perform both understanding and generation tasks involving non-linguistic modalities such as images or videos.
SPAE converts between raw pixels and interpretable lexical tokens (or words) extracted from the LLM's vocabulary. The resulting tokens capture both the semantic meaning and the fine-grained details needed for visual reconstruction, effectively translating the visual content into a language comprehensible to the LLM, and empowering it to perform a wide array of multimodal tasks. Our approach is validated through in-context learning experiments with frozen PaLM 2 and GPT 3.5 on a diverse set of image understanding and generation tasks. Our method marks the first successful attempt to enable a frozen LLM to generate image content while surpassing state-of-the-art performance in image understanding tasks, under the same setting, by over 25%.}, howpublished = {Advances in Neural Information Processing Systems (NeurIPS) (arXiv:2306.17842v2)}, keywords = {arXiv, computational video, computer vision, generative AI, NeurIPS}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2023-Warner-TACIFUOVIS, title = {Text and Click inputs for unambiguous open vocabulary instance segmentation}, author = {Nikolai Warner and Meera Hahn and Jonathan Huang and Irfan Essa and Vighnesh Birodkar}, url = {https://doi.org/10.48550/arXiv.2311.14822 https://arxiv.org/abs/2311.14822 https://arxiv.org/pdf/2311.14822.pdf}, doi = {10.48550/arXiv.2311.14822}, year = {2023}, date = {2023-11-24}, urldate = {2023-11-24}, booktitle = {Proceedings of the British Machine Vision Conference (BMVC)}, abstract = {Segmentation localizes objects in an image on a fine-grained per-pixel scale. Segmentation benefits from humans in the loop who provide additional input on objects to segment, using a combination of foreground or background clicks. Tasks include photo editing or novel dataset annotation, where human annotators leverage an existing segmentation model instead of drawing raw pixel-level annotations. We propose a new segmentation process, Text + Click segmentation, where a model takes as input an image, a text phrase describing a class to segment, and a single foreground click specifying the instance to segment. Compared to previous approaches, we leverage open-vocabulary image-text models to support a wide range of text prompts. Conditioning segmentations on text prompts improves the accuracy of segmentations on novel or unseen classes. We demonstrate that the combination of a single user-specified foreground click and a text prompt allows a model to better disambiguate overlapping or co-occurring semantic categories, such as "tie", "suit", and "person". We study these results across common segmentation datasets such as refCOCO, COCO, VOC, and OpenImages. Source code is available.}, keywords = {arXiv, BMVC, computer vision, google, image segmentation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2023-Kumar-WIALDHRBULGIM, title = {Words into Action: Learning Diverse Humanoid Robot Behaviors using Language Guided Iterative Motion Refinement}, author = {K.
Niranjan Kumar and Irfan Essa and Sehoon Ha}, url = {https://doi.org/10.48550/arXiv.2310.06226 https://arxiv.org/abs/2310.06226 https://arxiv.org/pdf/2310.06226.pdf https://www.kniranjankumar.com/words_into_action/ }, doi = {10.48550/arXiv.2310.06226}, year = {2023}, date = {2023-11-01}, urldate = {2023-11-01}, booktitle = {CoRL Workshop on Language and Robot Learning Language as Grounding (with CoRL 2023)}, abstract = {We present a method to simplify controller design by enabling users to train and fine-tune robot control policies using natural language commands. We first learn a neural network policy that generates behaviors given a natural language command, such as “walk forward”, by combining Large Language Models (LLMs), motion retargeting, and motion imitation. Based on the synthesized motion, we iteratively fine-tune by updating the text prompt and querying LLMs to find the best checkpoint associated with the closest motion in history.}, keywords = {arXiv, CoRL, robotics, vision & language}, pubstate = {published}, tppubtype = {inproceedings} } @article{2023-Kumar-CCRLCIB, title = {Cascaded Compositional Residual Learning for Complex Interactive Behaviors}, author = {K. Niranjan Kumar and Irfan Essa and Sehoon Ha}, url = {https://ieeexplore.ieee.org/document/10152471}, doi = {10.1109/LRA.2023.3286171}, year = {2023}, date = {2023-06-14}, urldate = {2023-06-14}, journal = {IEEE Robotics and Automation Letters}, volume = {8}, issue = {8}, pages = {4601--4608}, abstract = {Real-world autonomous missions often require rich interaction with nearby objects, such as doors or switches, along with effective navigation. However, such complex behaviors are difficult to learn because they involve both high-level planning and low-level motor control. We present a novel framework, Cascaded Compositional Residual Learning (CCRL), which learns composite skills by recursively leveraging a library of previously learned control policies. Our framework combines multiple levels of pre-learned skills by using multiplicative skill composition and residual action learning. We also introduce a goal synthesis network and an observation selector to support combination of heterogeneous skills, each with its unique goals and observation space. Finally, we develop residual regularization for learning policies that solve a new task, while preserving the style of the motion enforced by the skill library. We show that our framework learns joint-level control policies for a diverse set of motor skills ranging from basic locomotion to complex interactive navigation, including navigating around obstacles, pushing objects, crawling under a table, pushing a door open with its leg, and holding it open while walking through it. 
The proposed CCRL framework leads to policies with consistent styles and lower joint torques, and successfully transfer to a real Unitree A1 robot without any additional fine-tuning.}, keywords = {IEEE, reinforcement learning, robotics}, pubstate = {published}, tppubtype = {article} } @inproceedings{2023-Bashkirova-MUSMIG, title = {MaskSketch: Unpaired Structure-guided Masked Image Generation}, author = { Dina Bashkirova and José Lezama and Kihyuk Sohn and Kate Saenko and Irfan Essa}, url = {https://arxiv.org/abs/2302.05496 https://openaccess.thecvf.com/content/CVPR2023/papers/Bashkirova_MaskSketch_Unpaired_Structure-Guided_Masked_Image_Generation_CVPR_2023_paper.pdf https://openaccess.thecvf.com/content/CVPR2023/supplemental/Bashkirova_MaskSketch_Unpaired_Structure-Guided_CVPR_2023_supplemental.pdf}, doi = {10.48550/ARXIV.2302.05496}, year = {2023}, date = {2023-06-01}, urldate = {2023-06-01}, booktitle = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)}, abstract = {Recent conditional image generation methods produce images of remarkable diversity, fidelity and realism. However, the majority of these methods allow conditioning only on labels or text prompts, which limits their level of control over the generation result. In this paper, we introduce MaskSketch, an image generation method that allows spatial conditioning of the generation result using a guiding sketch as an extra conditioning signal during sampling. MaskSketch utilizes a pre-trained masked generative transformer, requiring no model training or paired supervision, and works with input sketches of different levels of abstraction. We show that intermediate self-attention maps of a masked generative transformer encode important structural information of the input image, such as scene layout and object shape, and we propose a novel sampling method based on this observation to enable structure-guided generation. Our results show that MaskSketch achieves high image realism and fidelity to the guiding structure. Evaluated on standard benchmark datasets, MaskSketch outperforms state-of-the-art methods for sketch-to-image translation, as well as unpaired image-to-image translation approaches.}, keywords = {computer vision, CVPR, generative AI, generative media, google}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2023-Yu-MMGVT, title = {MAGVIT: Masked Generative Video Transformer}, author = {Lijun Yu and Yong Cheng and Kihyuk Sohn and José Lezama and Han Zhang and Huiwen Chang and Alexander G. Hauptmann and Ming-Hsuan Yang and Yuan Hao and Irfan Essa and Lu Jiang}, url = {https://arxiv.org/abs/2212.05199 https://magvit.cs.cmu.edu/ https://openaccess.thecvf.com/content/CVPR2023/papers/Yu_MAGVIT_Masked_Generative_Video_Transformer_CVPR_2023_paper.pdf https://openaccess.thecvf.com/content/CVPR2023/supplemental/Yu_MAGVIT_Masked_Generative_CVPR_2023_supplemental.pdf}, doi = {10.48550/ARXIV.2212.05199}, year = {2023}, date = {2023-06-01}, urldate = {2023-06-01}, booktitle = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)}, abstract = {We introduce the MAsked Generative VIdeo Transformer, MAGVIT, to tackle various video synthesis tasks with a single model. We introduce a 3D tokenizer to quantize a video into spatial-temporal visual tokens and propose an embedding method for masked video token modeling to facilitate multi-task learning. We conduct extensive experiments to demonstrate the quality, efficiency, and flexibility of MAGVIT. 
Our experiments show that (i) MAGVIT performs favorably against state-of-the-art approaches and establishes the best-published FVD on three video generation benchmarks, including the challenging Kinetics-600; (ii) MAGVIT outperforms existing methods in inference time by two orders of magnitude against diffusion models and by 60x against autoregressive models; and (iii) a single MAGVIT model supports ten diverse generation tasks and generalizes across videos from different visual domains. The source code and trained models will be released to the public.}, keywords = {computational video, computer vision, CVPR, generative AI, generative media, google}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Sohn-VPTGTL, title = {Visual Prompt Tuning for Generative Transfer Learning}, author = {Kihyuk Sohn and Yuan Hao and José Lezama and Luisa Polania and Huiwen Chang and Han Zhang and Irfan Essa and Lu Jiang}, url = {https://arxiv.org/abs/2210.00990 https://openaccess.thecvf.com/content/CVPR2023/papers/Sohn_Visual_Prompt_Tuning_for_Generative_Transfer_Learning_CVPR_2023_paper.pdf https://openaccess.thecvf.com/content/CVPR2023/supplemental/Sohn_Visual_Prompt_Tuning_CVPR_2023_supplemental.pdf}, doi = {10.48550/ARXIV.2210.00990}, year = {2023}, date = {2023-06-01}, urldate = {2023-06-01}, booktitle = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)}, abstract = {Transferring knowledge from an image synthesis model trained on a large dataset is a promising direction for learning generative image models from various domains efficiently. While previous works have studied GAN models, we present a recipe for learning vision transformers by generative knowledge transfer. We base our framework on state-of-the-art generative vision transformers that represent an image as a sequence of visual tokens fed to autoregressive or non-autoregressive transformers. To adapt to a new domain, we employ prompt tuning, which prepends learnable tokens, called a prompt, to the image token sequence, and we introduce a new prompt design for our task. We study a variety of visual domains, including the visual task adaptation benchmark (VTAB), with varying amounts of training images, and show the effectiveness of knowledge transfer and significantly better image generation quality over existing works.}, keywords = {computer vision, CVPR, generative AI, generative media, google}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{2023-Sohn-LDPCIS, title = {Learning Disentangled Prompts for Compositional Image Synthesis}, author = {Kihyuk Sohn and Albert Shaw and Yuan Hao and Han Zhang and Luisa Polania and Huiwen Chang and Lu Jiang and Irfan Essa}, url = {https://arxiv.org/abs/2306.00763}, doi = {10.48550/arXiv.2306.00763}, year = {2023}, date = {2023-06-01}, urldate = {2023-06-01}, abstract = {We study domain-adaptive image synthesis, the problem of teaching pretrained image generative models a new style or concept from as few as one image to synthesize novel images, to better understand compositional image synthesis. We present a framework that leverages a pre-trained class-conditional generation model and visual prompt tuning. Specifically, we propose a novel source-class-distilled visual prompt that learns disentangled prompts of semantic (e.g., class) and domain (e.g., style) from a few images. The learned domain prompt is then used to synthesize images of any class in the style of the target domain.
We conduct studies on various target domains with the number of images ranging from one to a few to many, and show qualitative results which show the compositional generalization of our method. Moreover, we show that our method can help improve zero-shot domain adaptation classification accuracy. }, howpublished = {arXiv:2306.00763 }, keywords = {arXiv, computer vision, generative AI, google, prompt engineering}, pubstate = {published}, tppubtype = {techreport} } @inproceedings{2023-Lezama-DPDMIS, title = {Discrete Predictor-Corrector Diffusion Models for Image Synthesis}, author = {José Lezama and Tim Salimans and Lu Jiang and Huiwen Chang and Jonathan Ho and Irfan Essa}, url = {https://openreview.net/forum?id=VM8batVBWvg}, year = {2023}, date = {2023-05-01}, urldate = {2023-05-01}, booktitle = {International Conference on Learning Representations (ICLR)}, abstract = {We introduce Discrete Predictor-Corrector diffusion models (DPC), extending predictor-corrector samplers in Gaussian diffusion models to the discrete case. Predictor-corrector samplers are a class of samplers for diffusion models, which improve on ancestral samplers by correcting the sampling distribution of intermediate diffusion states using MCMC methods. In DPC, the Langevin corrector, which does not have a direct counterpart in discrete space, is replaced with a discrete MCMC transition defined by a learned corrector kernel. The corrector kernel is trained to make the correction steps achieve asymptotic convergence, in distribution, to the correct marginal of the intermediate diffusion states. Equipped with DPC, we revisit recent transformer-based non-autoregressive generative models through the lens of discrete diffusion, and find that DPC can alleviate the compounding decoding error due to the parallel sampling of visual tokens. Our experiments show that DPC improves upon existing discrete latent space models for class-conditional image generation on ImageNet, and outperforms continuous diffusion models and GANs, according to standard metrics and user preference studies}, keywords = {computer vision, generative AI, generative media, google, ICLR, machine learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2023-Wijmans-EMMBNA, title = {Emergence of Maps in the Memories of Blind Navigation Agents}, author = {Erik Wijmans and Manolis Savva and Irfan Essa and Stefan Lee and Ari S. Morcos and Dhruv Batra}, url = {https://arxiv.org/abs/2301.13261 https://wijmans.xyz/publication/eom/ https://openreview.net/forum?id=lTt4KjHSsyl https://blog.iclr.cc/2023/03/21/announcing-the-iclr-2023-outstanding-paper-award-recipients/}, doi = {10.48550/ARXIV.2301.13261}, year = {2023}, date = {2023-05-01}, urldate = {2023-05-01}, booktitle = {Proceedings of International Conference on Learning Representations (ICLR)}, abstract = {Animal navigation research posits that organisms build and maintain internal spatial representations, or maps, of their environment. We ask if machines -- specifically, artificial intelligence (AI) navigation agents -- also build implicit (or 'mental') maps. A positive answer to this question would (a) explain the surprising phenomenon in recent literature of ostensibly map-free neural-networks achieving strong performance, and (b) strengthen the evidence of mapping as a fundamental mechanism for navigation by intelligent embodied agents, whether they be biological or artificial. 
Unlike animal navigation, we can judiciously design the agent's perceptual system and control the learning paradigm to nullify alternative navigation mechanisms. Specifically, we train 'blind' agents -- with sensing limited to only egomotion and no other sensing of any kind -- to perform PointGoal navigation ('go to Δ x, Δ y') via reinforcement learning. Our agents are composed of navigation-agnostic components (fully-connected and recurrent neural networks), and our experimental setup provides no inductive bias towards mapping. Despite these harsh conditions, we find that blind agents are (1) surprisingly effective navigators in new environments (~95% success); (2) they utilize memory over long horizons (remembering ~1,000 steps of past experience in an episode); (3) this memory enables them to exhibit intelligent behavior (following walls, detecting collisions, taking shortcuts); (4) there is emergence of maps and collision detection neurons in the representations of the environment built by a blind agent as it navigates; and (5) the emergent maps are selective and task dependent (e.g. the agent 'forgets' exploratory detours). Overall, this paper presents no new techniques for the AI audience, but a surprising finding, an insight, and an explanation.}, keywords = {awards, best paper award, computer vision, google, ICLR, machine learning, robotics}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2023-Peng-SGASESDNA, title = {Slide Gestalt: Automatic Structure Extraction in Slide Decks for Non-Visual Access}, author = {Yi-Hao Peng and Peggy Chi and Anjuli Kannan and Meredith Morris and Irfan Essa}, url = {https://research.google/pubs/pub52182/ https://dl.acm.org/doi/fullHtml/10.1145/3544548.3580921 https://doi.org/10.1145/3544548.3580921 https://www.youtube.com/watch?v=pK08aMRx4qo}, year = {2023}, date = {2023-04-23}, urldate = {2023-04-23}, booktitle = {ACM CHI Conference on Human Factors in Computing Systems (CHI)}, abstract = {Presentation slides commonly use visual patterns for structural navigation, such as titles, dividers, and build slides. However, screen readers do not capture such intention, making it time-consuming and less accessible for blind and visually impaired (BVI) users to linearly consume slides with repeated content. We present Slide Gestalt, an automatic approach that identifies the hierarchical structure in a slide deck. Slide Gestalt computes the visual and textual correspondences between slides to generate hierarchical groupings. Readers can navigate the slide deck from the higher-level section overview to the lower-level description of a slide group or individual elements interactively with our UI. We derived slide consumption and authoring practices from interviews with BVI readers and sighted creators and an analysis of 100 decks. We performed our pipeline with 50 real-world slide decks and a large dataset.
Feedback from eight BVI participants showed that Slide Gestalt helped navigate a slide deck by anchoring content more efficiently, compared to using accessible slides.}, keywords = {accessibility, CHI, google, human-computer interaction}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2023-Samel-KRBINKILR, title = {Knowledge Relevance BERT: Integrating Noisy Knowledge into Language Representation.}, author = {Karan Samel and Jun Ma and Zhengyang Wang and Tong Zhao and Irfan Essa}, url = {https://knowledge-nlp.github.io/aaai2023/papers/005-KRBERT-oral.pdf}, year = {2023}, date = {2023-02-01}, urldate = {2023-02-01}, booktitle = {AAAI workshop on Knowledge Augmented Methods for NLP (KnowledgeNLP-AAAI 2023)}, abstract = {Integrating structured knowledge into language model representations increases recall of domain-specific information useful for downstream tasks. Matching between knowledge graph entities and text entity mentions can be easily performed when entity names are unique or entity-linking data exists. When extending this setting to new domains, newly mined knowledge contains ambiguous and incorrect information without explicit linking information. In such settings, we design a framework to robustly link relevant knowledge to input texts as an intermediate modeling step while performing end-to-end domain fine-tuning tasks. This is done by first computing the similarity of the existing task labels with candidate knowledge triplets to generate relevance labels. We use these labels to train a relevance model, which predicts the relevance of the inserted triplets to the original text. This relevance model is integrated within a language model, leading to our Knowledge Relevance BERT (KR-BERT) framework. We test KR-BERT for linking and ranking tasks on a real-world e-commerce dataset and a public entity linking task, where we show performance improvements over strong baselines.}, keywords = {AI, knowledge representation, NLP}, pubstate = {published}, tppubtype = {inproceedings} } @patent{2023-Zhang-IMTI, title = {Image manipulation by text instruction}, author = {Tianhao Zhang and Weilong Yang and Honglak Lee and Hung-Yu Tseng and Irfan Essa and Lu Jiang}, url = {https://patents.google.com/patent/US11562518}, year = {2023}, date = {2023-01-01}, urldate = {2023-01-01}, abstract = {A method for generating an output image from an input image and an input text instruction that specifies a location and a modification of an edit applied to the input image using a neural network is described. The neural network includes an image encoder, an image decoder, and an instruction attention network. 
The method includes receiving the input image and the input text instruction; extracting, from the input image, an input image feature that represents features of the input image using the image encoder; generating a spatial feature and a modification feature from the input text instruction using the instruction attention network; generating an edited image feature from the input image feature, the spatial feature and the modification feature; and generating the output image from the edited image feature using the image decoder.}, howpublished = {US Patent # US11562518}, keywords = {content creation, generative AI, google, media generation, patents}, pubstate = {published}, tppubtype = {patent} } @inproceedings{2022-Wijmans-TPNASCB, title = {How to Train PointGoal Navigation Agents on a (Sample and Compute) Budget}, author = {Erik Wijmans and Irfan Essa and Dhruv Batra}, url = {https://arxiv.org/abs/2012.06117 https://ifaamas.org/Proceedings/aamas2022/pdfs/p1762.pdf}, doi = {10.48550/arXiv.2012.06117}, year = {2022}, date = {2022-12-01}, urldate = {2020-12-01}, booktitle = {International Conference on Autonomous Agents and Multi-Agent Systems (AAMAS)}, journal = {arXiv}, number = {arXiv:2012.06117}, abstract = {PointGoal navigation has seen significant recent interest and progress, spurred on by the Habitat platform and associated challenge. In this paper, we study PointGoal navigation under both a sample budget (75 million frames) and a compute budget (1 GPU for 1 day). We conduct an extensive set of experiments, cumulatively totaling over 50,000 GPU-hours, that let us identify and discuss a number of ostensibly minor but significant design choices -- the advantage estimation procedure (a key component in training), visual encoder architecture, and a seemingly minor hyper-parameter change. Overall, these design choices lead to considerable and consistent improvements over the baselines present in Savva et al. Under a sample budget, performance for RGB-D agents improves by 8 SPL on Gibson (14% relative improvement) and 20 SPL on Matterport3D (38% relative improvement). Under a compute budget, performance for RGB-D agents improves by 19 SPL on Gibson (32% relative improvement) and 35 SPL on Matterport3D (220% relative improvement). We hope our findings and recommendations will serve to make the community's experiments more efficient.}, keywords = {computer vision, embodied agents, navigation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Wijmans-SOLENER, title = {VER: Scaling On-Policy RL Leads to the Emergence of Navigation in Embodied Rearrangement}, author = {Erik Wijmans and Irfan Essa and Dhruv Batra}, editor = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, url = {https://arxiv.org/abs/2210.05064 https://openreview.net/forum?id=VrJWseIN98}, doi = {10.48550/ARXIV.2210.05064}, year = {2022}, date = {2022-12-01}, urldate = {2022-12-01}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, abstract = {We present Variable Experience Rollout (VER), a technique for efficiently scaling batched on-policy reinforcement learning in heterogeneous environments (where different environments take vastly different times to generate rollouts) to many GPUs residing on, potentially, many machines. VER combines the strengths of and blurs the line between synchronous and asynchronous on-policy RL methods (SyncOnRL and AsyncOnRL, respectively).
Specifically, it learns from on-policy experience (like SyncOnRL) and has no synchronization points (like AsyncOnRL), enabling high throughput. We find that VER leads to significant and consistent speed-ups across a broad range of embodied navigation and mobile manipulation tasks in photorealistic 3D simulation environments. Specifically, for PointGoal navigation and ObjectGoal navigation in Habitat 1.0, VER is 60-100% faster (1.6-2x speedup) than DD-PPO, the current state of the art for distributed SyncOnRL, with similar sample efficiency. For mobile manipulation tasks (open fridge/cabinet, pick/place objects) in Habitat 2.0, VER is 150% faster (2.5x speedup) on 1 GPU and 170% faster (2.7x speedup) on 8 GPUs than DD-PPO. Compared to SampleFactory (the current state-of-the-art AsyncOnRL), VER matches its speed on 1 GPU, and is 70% faster (1.7x speedup) on 8 GPUs with better sample efficiency. We leverage these speed-ups to train chained skills for GeometricGoal rearrangement tasks in the Home Assistant Benchmark (HAB). We find a surprising emergence of navigation in skills that do not ostensibly require any navigation. Specifically, the Pick skill involves a robot picking an object from a table. During training the robot was always spawned close to the table and never needed to navigate. However, we find that if base movement is part of the action space, the robot learns to navigate then pick an object in new environments with 50% success, demonstrating surprisingly high out-of-distribution generalization.}, keywords = {machine learning, NeurIPS, reinforcement learning, robotics}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Alamri-EMRLVD, title = {End-to-end Multimodal Representation Learning for Video Dialog}, author = {Huda Alamri and Anthony Bilic and Michael Hu and Apoorva Beedu and Irfan Essa}, url = {https://arxiv.org/abs/2210.14512}, doi = {10.48550/arXiv.2210.14512}, year = {2022}, date = {2022-12-01}, urldate = {2022-12-01}, booktitle = {NeurIPS Workshop on Vision Transformers: Theory and Applications}, abstract = {The video-based dialog task is a challenging multimodal learning task that has received increasing attention over the past few years, with state-of-the-art models obtaining new performance records. This progress is largely powered by the adaptation of more powerful transformer-based language encoders. Despite this progress, existing approaches do not effectively utilize visual features to help solve tasks. Recent studies show that state-of-the-art models are biased towards textual information rather than visual cues. In order to better leverage the available visual information, this study proposes a new framework that combines a 3D-CNN network and transformer-based networks into a single visual encoder to extract more robust semantic representations from videos. The visual encoder is jointly trained end-to-end with other input modalities such as text and audio.
Experiments on the AVSD task show significant improvement over baselines in both generative and retrieval tasks.}, keywords = {computational video, computer vision, vision transformers}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Beedu-VBOPEUT, title = {Video based Object 6D Pose Estimation using Transformers}, author = {Apoorva Beedu and Huda Alamri and Irfan Essa}, url = {https://arxiv.org/abs/2210.13540}, doi = {10.48550/arXiv.2210.13540}, year = {2022}, date = {2022-12-01}, urldate = {2022-12-01}, booktitle = {NeurIPS Workshop on Vision Transformers: Theory and Applications}, abstract = {We introduce a Transformer-based 6D Object Pose Estimation framework, VideoPose, comprising an end-to-end attention-based modelling architecture that attends to previous frames in order to estimate accurate 6D Object Poses in videos. Our approach leverages the temporal information from a video sequence for pose refinement, along with being computationally efficient and robust. Compared to existing methods, our architecture is able to capture and reason from long-range dependencies efficiently, thus iteratively refining over video sequences. Experimental evaluation on the YCB-Video dataset shows that our approach is on par with the state-of-the-art Transformer methods, and performs significantly better relative to CNN-based approaches. Further, with a speed of 33 fps, it is also more efficient and therefore applicable to a variety of applications that require real-time object pose estimation. Training code and pretrained models are available at https://anonymous.4open.science/r/VideoPose-3C8C.}, keywords = {computer vision, vision transformers}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Lezama-IMIGWT, title = {Improved Masked Image Generation with Token-Critic}, author = {José Lezama and Huiwen Chang and Lu Jiang and Irfan Essa}, url = {https://arxiv.org/abs/2209.04439 https://rdcu.be/c61MZ}, doi = {10.1007/978-3-031-20050-2_5}, isbn = {978-3-031-20050-2}, year = {2022}, date = {2022-10-28}, urldate = {2022-10-28}, booktitle = {European Conference on Computer Vision (ECCV)}, volume = {13683}, publisher = {Springer}, abstract = {Non-autoregressive generative transformers recently demonstrated impressive image generation performance, and orders of magnitude faster sampling than their autoregressive counterparts. However, optimal parallel sampling from the true joint distribution of visual tokens remains an open challenge. In this paper we introduce Token-Critic, an auxiliary model to guide the sampling of a non-autoregressive generative transformer. Given a masked-and-reconstructed real image, the Token-Critic model is trained to distinguish which visual tokens belong to the original image and which were sampled by the generative transformer. During non-autoregressive iterative sampling, Token-Critic is used to select which tokens to accept and which to reject and resample.
Coupled with Token-Critic, a state-of-the-art generative transformer significantly improves its performance, and outperforms recent diffusion models and GANs in terms of the trade-off between generated image quality and diversity, in the challenging class-conditional ImageNet generation.}, keywords = {computer vision, ECCV, generative AI, generative media, google}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Kong-BLTCLG, title = {BLT: Bidirectional Layout Transformer for Controllable Layout Generation}, author = {Xiang Kong and Lu Jiang and Huiwen Chang and Han Zhang and Yuan Hao and Haifeng Gong and Irfan Essa}, url = {https://arxiv.org/abs/2112.05112 https://rdcu.be/c61AE}, doi = {10.1007/978-3-031-19790-1_29}, isbn = {978-3-031-19789-5}, year = {2022}, date = {2022-10-25}, urldate = {2022-10-25}, booktitle = {European Conference on Computer Vision (ECCV)}, volume = {13677}, abstract = {Creating visual layouts is a critical step in graphic design. Automatic generation of such layouts is essential for scalable and diverse visual designs. To advance conditional layout generation, we introduce BLT, a bidirectional layout transformer. BLT differs from previous work on transformers in adopting non-autoregressive transformers. In training, BLT learns to predict the masked attributes by attending to surrounding attributes in two directions. During inference, BLT first generates a draft layout from the input and then iteratively refines it into a high-quality layout by masking out low-confident attributes. The masks generated in both training and inference are controlled by a new hierarchical sampling policy. We verify the proposed model on six benchmarks of diverse design tasks. Experimental results demonstrate two benefits compared to the state-of-the-art layout transformer models. First, our model empowers layout transformers to fulfill controllable layout generation. Second, it achieves up to 10x speedup in generating a layout at inference time than the layout transformer baseline. Code is released at https://shawnkx.github.io/blt.}, keywords = {computer vision, ECCV, generative AI, generative media, google, vision transformer}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Chi-SVPFD, title = {Synthesis-Assisted Video Prototyping From a Document}, author = {Peggy Chi and Tao Dong and Christian Frueh and Brian Colonna and Vivek Kwatra and Irfan Essa}, url = {https://research.google/pubs/pub51631/ https://dl.acm.org/doi/abs/10.1145/3526113.3545676}, doi = {10.1145/3526113.3545676}, year = {2022}, date = {2022-10-01}, urldate = {2022-10-01}, booktitle = {Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology}, pages = {1--10}, abstract = {Video productions commonly start with a script, especially for talking head videos that feature a speaker narrating to the camera. When the source materials come from a written document -- such as a web tutorial, it takes iterations to refine content from a text article to a spoken dialogue, while considering visual compositions in each scene. We propose Doc2Video, a video prototyping approach that converts a document to interactive scripting with a preview of synthetic talking head videos. Our pipeline decomposes a source document into a series of scenes, each automatically creating a synthesized video of a virtual instructor. 
Designed for a specific domain (programming cookbooks), we apply visual elements from the source document, such as a keyword, a code snippet, or a screenshot, in suitable layouts. Users edit narration sentences, break or combine sections, and modify visuals to prototype a video in our Editing UI. We evaluated our pipeline with public programming cookbooks. Feedback from professional creators shows that our method provided a reasonable starting point to engage them in interactive scripting for a narrated instructional video.}, keywords = {computational video, generative media, google, human-computer interaction, UIST, video editing}, pubstate = {published}, tppubtype = {inproceedings} } @article{2022-Haresamudram-ASSHARUW, title = {Assessing the State of Self-Supervised Human Activity Recognition using Wearables}, author = {Harish Haresamudram and Irfan Essa and Thomas Ploetz}, url = {https://dl.acm.org/doi/10.1145/3550299 https://arxiv.org/abs/2202.12938 https://arxiv.org/pdf/2202.12938}, doi = {10.1145/3550299}, year = {2022}, date = {2022-09-07}, urldate = {2022-09-07}, journal = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)}, volume = {6}, number = {116}, issue = {3}, pages = {1--47}, publisher = {ACM}, abstract = {The emergence of self-supervised learning in the field of wearables-based human activity recognition (HAR) has opened up opportunities to tackle the most pressing challenges in the field, namely to exploit unlabeled data to derive reliable recognition systems for scenarios where only small amounts of labeled training samples can be collected. As such, self-supervision, i.e., the paradigm of 'pretrain-then-finetune', has the potential to become a strong alternative to the predominant end-to-end training approaches, let alone hand-crafted features for the classic activity recognition chain. Recently, a number of contributions have been made that introduced self-supervised learning into the field of HAR, including Multi-task self-supervision, Masked Reconstruction, CPC, and SimCLR, to name but a few. With the initial success of these methods, the time has come for a systematic inventory and analysis of the potential self-supervised learning has for the field. This paper provides exactly that. We assess the progress of self-supervised HAR research by introducing a framework that performs a multi-faceted exploration of model performance. We organize the framework into three dimensions, each containing three constituent criteria, such that each dimension captures specific aspects of performance, including the robustness to differing source and target conditions, the influence of dataset characteristics, and the feature space characteristics. We utilize this framework to assess seven state-of-the-art self-supervised methods for HAR, leading to the formulation of insights into the properties of these techniques and to establishing their value towards learning representations for diverse scenarios.}, keywords = {activity recognition, IMWUT, ubiquitous computing, wearable computing}, pubstate = {published}, tppubtype = {article} } @inproceedings{2022-Nkemelu-THSLLWCE, title = {Tackling Hate Speech in Low-resource Languages with Context Experts}, author = {Daniel Nkemelu and Harshil Shah and Irfan Essa and Michael L.
Best}, url = {https://www.nkemelu.com/data/ictd2022_nkemelu_final.pdf }, year = {2022}, date = {2022-06-01}, urldate = {2022-06-01}, booktitle = {International Conference on Information & Communication Technologies and Development (ICTD)}, abstract = {Given Myanmar's historical and socio-political context, hate speech spread on social media has escalated into offline unrest and violence. This paper presents findings from our remote study on the automatic detection of hate speech online in Myanmar. We argue that effectively addressing this problem will require community-based approaches that combine the knowledge of context experts with machine learning tools that can analyze the vast amount of data produced. To this end, we develop a systematic process to facilitate this collaboration covering key aspects of data collection, annotation, and model validation strategies. We highlight challenges in this area stemming from small and imbalanced datasets, the need to balance non-glamorous data work and stakeholder priorities, and closed data sharing practices. Building on these findings, we discuss avenues for further work in developing and deploying hate speech detection systems for low-resource languages.}, keywords = {computational journalism, ICTD, social computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2021-Kumar-GCSGIEUDRL, title = {Graph-based Cluttered Scene Generation and Interactive Exploration using Deep Reinforcement Learning}, author = {Niranjan Kumar and Irfan Essa and Sehoon Ha}, url = {https://doi.org/10.1109/ICRA46639.2022.9811874 https://arxiv.org/abs/2109.10460 https://arxiv.org/pdf/2109.10460 https://www.kniranjankumar.com/projects/5_clutr https://kniranjankumar.github.io/assets/pdf/graph_based_clutter.pdf https://youtu.be/T2Jo7wwaXss}, doi = {10.1109/ICRA46639.2022.9811874}, year = {2022}, date = {2022-05-01}, urldate = {2022-05-01}, booktitle = {Proceedings International Conference on Robotics and Automation (ICRA)}, journal = {arXiv}, number = {2109.10460}, pages = {7521--7527}, abstract = {We introduce a novel method to teach a robotic agent to interactively explore cluttered yet structured scenes, such as kitchen pantries and grocery shelves, by leveraging the physical plausibility of the scene. We propose a novel learning framework to train an effective scene exploration policy to discover hidden objects with minimal interactions. First, we define a novel scene grammar to represent structured clutter. Then we train a Graph Neural Network (GNN) based Scene Generation agent using deep reinforcement learning (deep RL) to manipulate this Scene Grammar to create a diverse set of stable scenes, each containing multiple hidden objects. Given such cluttered scenes, we then train a Scene Exploration agent, using deep RL, to uncover hidden objects by interactively rearranging the scene. }, keywords = {ICRA, machine learning, reinforcement learning, robotics}, pubstate = {published}, tppubtype = {inproceedings} } @article{2022-Samel-LTRFNTD, title = {Learning Temporal Rules from Noisy Timeseries Data}, author = {Karan Samel and Zelin Zhao and Binghong Chen and Shuang Li and Dharmashankar Subramanian and Irfan Essa and Le Song}, url = {https://arxiv.org/abs/2202.05403 https://arxiv.org/pdf/2202.05403}, year = {2022}, date = {2022-02-01}, urldate = {2022-02-01}, journal = {arXiv preprint arXiv:2202.05403}, abstract = {Events across a timeline are a common data representation, seen in different temporal modalities.
Individual atomic events can occur in a certain temporal ordering to compose higher-level composite events. Examples of a composite event are a patient's medical symptom or a baseball player hitting a home run, caused by distinct temporal orderings of patient vitals and player movements respectively. Such salient composite events are provided as labels in temporal datasets and most works optimize models to predict these composite event labels directly. We focus on uncovering the underlying atomic events and their relations that lead to the composite events within a noisy temporal data setting. We propose Neural Temporal Logic Programming (Neural TLP) which first learns implicit temporal relations between atomic events and then lifts logic rules for composite events, given only the composite event labels for supervision. This is done by efficiently searching through the combinatorial space of all temporal logic rules in an end-to-end differentiable manner. We evaluate our method on video and healthcare datasets where it outperforms the baseline methods for rule discovery. }, keywords = {activity recognition, machine learning}, pubstate = {published}, tppubtype = {article} } @inproceedings{2022-Mao-DRSVTR, title = {Discrete Representations Strengthen Vision Transformer Robustness}, author = {Chengzhi Mao and Lu Jiang and Mostafa Dehghani and Carl Vondrick and Rahul Sukthankar and Irfan Essa}, url = {https://iclr.cc/virtual/2022/poster/6647 https://arxiv.org/abs/2111.10493 https://research.google/pubs/pub51388/ https://openreview.net/forum?id=8hWs60AZcWk}, doi = {10.48550/arXiv.2111.10493}, year = {2022}, date = {2022-01-28}, urldate = {2022-04-01}, booktitle = {Proceedings of International Conference on Learning Representations (ICLR)}, journal = {arXiv preprint arXiv:2111.10493}, abstract = {Vision Transformer (ViT) is emerging as the state-of-the-art architecture for image recognition. While recent studies suggest that ViTs are more robust than their convolutional counterparts, our experiments find that ViTs trained on ImageNet are overly reliant on local textures and fail to make adequate use of shape information. ViTs thus have difficulties generalizing to out-of-distribution, real-world data. To address this deficiency, we present a simple and effective architecture modification to ViT's input layer by adding discrete tokens produced by a vector-quantized encoder. Different from the standard continuous pixel tokens, discrete tokens are invariant under small perturbations and contain less information individually, which encourages ViTs to learn global information that is invariant.
Experimental results demonstrate that adding discrete representation on four architecture variants strengthens ViT robustness by up to 12% across seven ImageNet robustness benchmarks while maintaining the performance on ImageNet.}, keywords = {computer vision, google, machine learning, vision transformer}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Hickson-SDNFMPP, title = {Sharing Decoders: Network Fission for Multi-Task Pixel Prediction}, author = {Steven Hickson and Karthik Raveendran and Irfan Essa}, url = {https://openaccess.thecvf.com/content/WACV2022/papers/Hickson_Sharing_Decoders_Network_Fission_for_Multi-Task_Pixel_Prediction_WACV_2022_paper.pdf https://openaccess.thecvf.com/content/WACV2022/supplemental/Hickson_Sharing_Decoders_Network_WACV_2022_supplemental.pdf https://youtu.be/qqYODA4C6AU}, doi = {10.1109/WACV51458.2022.00371}, year = {2022}, date = {2022-01-01}, urldate = {2022-01-01}, booktitle = {IEEE/CVF Winter Conference on Applications of Computer Vision}, pages = {3771--3780}, abstract = {We examine the benefits of splitting encoder-decoders for multitask learning and showcase results on three tasks (semantics, surface normals, and depth) while adding very few FLOPS per task. Current hard parameter sharing methods for multi-task pixel-wise labeling use one shared encoder with separate decoders for each task. We generalize this notion and term the splitting of encoder-decoder architectures at different points as fission. Our ablation studies on fission show that sharing most of the decoder layers in multi-task encoder-decoder networks results in improvement while adding far fewer parameters per task. Our proposed method trains faster, uses less memory, results in better accuracy, and uses significantly fewer floating point operations (FLOPS) than conventional multi-task methods, with additional tasks only requiring 0.017% more FLOPS than the single-task network.}, keywords = {computer vision, google, machine learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2022-Kumar-CCRLCIB, title = {Cascaded Compositional Residual Learning for Complex Interactive Behaviors}, author = {Niranjan Kumar and Irfan Essa and Sehoon Ha}, url = {https://arxiv.org/abs/2212.08954 https://www.kniranjankumar.com/ccrl/static/pdf/paper.pdf https://youtu.be/fAklIxiK7Qg }, doi = {10.48550/ARXIV.2212.08954}, year = {2022}, date = {2022-01-01}, urldate = {2022-01-01}, booktitle = {Sim-to-Real Robot Learning: Locomotion and Beyond Workshop at the Conference on Robot Learning (CoRL)}, publisher = {arXiv}, abstract = {Real-world autonomous missions often require rich interaction with nearby objects, such as doors or switches, along with effective navigation. However, such complex behaviors are difficult to learn because they involve both high-level planning and low-level motor control. We present a novel framework, Cascaded Compositional Residual Learning (CCRL), which learns composite skills by recursively leveraging a library of previously learned control policies. Our framework learns multiplicative policy composition, task-specific residual actions, and synthetic goal information simultaneously while freezing the prerequisite policies. We further explicitly control the style of the motion by regularizing residual actions. 
We show that our framework learns joint-level control policies for a diverse set of motor skills ranging from basic locomotion to complex interactive navigation, including navigating around obstacles, pushing objects, crawling under a table, pushing a door open with its leg, and holding it open while walking through it. The proposed CCRL framework leads to policies with consistent styles and lower joint torques, which we successfully transfer to a real Unitree A1 robot without any additional fine-tuning.}, keywords = {reinforcement learning, robotics}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{2021-Beedu-VEOPFV, title = {VideoPose: Estimating 6D object pose from videos}, author = {Apoorva Beedu and Zhile Ren and Varun Agrawal and Irfan Essa}, url = {https://arxiv.org/abs/2111.10677}, doi = {10.48550/arXiv.2111.10677}, year = {2021}, date = {2021-11-01}, urldate = {2021-11-01}, journal = {arXiv preprint arXiv:2111.10677}, abstract = {We introduce a simple yet effective algorithm that uses convolutional neural networks to directly estimate object poses from videos. Our approach leverages the temporal information from a video sequence, and is computationally efficient and robust enough to support robotic and AR domains. Our proposed network takes a pre-trained 2D object detector as input, and aggregates visual features through a recurrent neural network to make predictions at each frame. Experimental evaluation on the YCB-Video dataset shows that our approach is on par with the state-of-the-art algorithms. Further, with a speed of 30 fps, it is also more efficient than the state-of-the-art, and therefore applicable to a variety of applications that require real-time object pose estimation.}, keywords = {arXiv, computer vision, object detection, pose estimation}, pubstate = {published}, tppubtype = {techreport} } @inproceedings{2021-Zhang-TNOIMTI, title = {Text as Neural Operator: Image Manipulation by Text Instruction}, author = {Tianhao Zhang and Hung-Yu Tseng and Lu Jiang and Weilong Yang and Honglak Lee and Irfan Essa}, url = {https://dl.acm.org/doi/10.1145/3474085.3475343 https://arxiv.org/abs/2008.04556}, doi = {10.1145/3474085.3475343}, year = {2021}, date = {2021-10-01}, urldate = {2021-10-01}, booktitle = {ACM International Conference on Multimedia (ACM-MM)}, publisher = {ACM Press}, abstract = {In recent years, text-guided image manipulation has gained increasing attention in the multimedia and computer vision community. The input to conditional image generation has evolved from image-only to multimodality. In this paper, we study a setting that allows users to edit an image with multiple objects using complex text instructions to add, remove, or change the objects. The inputs of the task are multimodal, including (1) a reference image and (2) an instruction in natural language that describes desired modifications to the image. We propose a GAN-based method to tackle this problem. The key idea is to treat text as neural operators to locally modify the image feature. We show that the proposed model performs favorably against recent strong baselines on three public datasets.
Specifically, it generates images of greater fidelity and semantic relevance, and when used as an image query, leads to better retrieval performance.}, keywords = {computer vision, generative media, google, multimedia}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2021-Chi-AIVCFMT, title = {Automatic Instructional Video Creation from a Markdown-Formatted Tutorial}, author = {Peggy Chi and Nathan Frey and Katrina Panovich and Irfan Essa}, url = {https://doi.org/10.1145/3472749.3474778 https://research.google/pubs/pub50745/ https://youtu.be/WmrZ7PUjyuM}, doi = {10.1145/3472749.3474778}, year = {2021}, date = {2021-10-01}, urldate = {2021-10-01}, booktitle = {ACM Symposium on User Interface Software and Technology (UIST)}, publisher = {ACM Press}, abstract = {We introduce HowToCut, an automatic approach that converts a Markdown-formatted tutorial into an interactive video that presents the visual instructions with a synthesized voiceover for narration. HowToCut extracts instructional content from a multimedia document that describes a step-by-step procedure. Our method selects and converts text instructions to a voiceover. It makes automatic editing decisions to align the narration with edited visual assets, including step images, videos, and text overlays. We derive our video editing strategies from an analysis of 125 web tutorials and apply Computer Vision techniques to the assets. To enable viewers to interactively navigate the tutorial, HowToCut's conversational UI presents instructions in multiple formats upon user commands. We evaluated our automatically-generated video tutorials through user studies (N=20) and validated the video quality via an online survey (N=93). The evaluation shows that our method was able to effectively create informative and useful instructional videos from a web tutorial document for both reviewing and following.}, keywords = {google, human-computer interaction, UIST, video editing}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{2021-Samel-NTLP, title = {Neural Temporal Logic Programming}, author = {Karan Samel and Zelin Zhao and Binghong Chen and Shuang Li and Dharmashankar Subramanian and Irfan Essa and Le Song}, url = {https://openreview.net/forum?id=i7h4M45tU8}, year = {2021}, date = {2021-09-01}, urldate = {2021-09-01}, abstract = {Events across a timeline are a common data representation, seen in different temporal modalities. Individual atomic events can occur in a certain temporal ordering to compose higher-level composite events. Examples of a composite event are a patient's medical symptom or a baseball player hitting a home run, caused by distinct temporal orderings of patient vitals and player movements respectively. Such salient composite events are provided as labels in temporal datasets and most works optimize models to predict these composite event labels directly. We focus on uncovering the underlying atomic events and their relations that lead to the composite events within a noisy temporal data setting. We propose Neural Temporal Logic Programming (Neural TLP) which first learns implicit temporal relations between atomic events and then lifts logic rules for composite events, given only the composite event labels for supervision. This is done by efficiently searching through the combinatorial space of all temporal logic rules in an end-to-end differentiable manner. We evaluate our method on video and on healthcare data where it outperforms the baseline methods for rule discovery.
}, howpublished = {https://openreview.net/forum?id=i7h4M45tU8}, keywords = {activity recognition, arXiv, machine learning, openreview}, pubstate = {published}, tppubtype = {techreport} } @inproceedings{2021-Frey-ASTNVE, title = {Automatic Style Transfer for Non-Linear Video Editing}, author = {Nathan Frey and Peggy Chi and Weilong Yang and Irfan Essa}, url = {https://arxiv.org/abs/2105.06988 https://research.google/pubs/pub50449/}, doi = {10.48550/arXiv.2105.06988}, year = {2021}, date = {2021-06-01}, urldate = {2021-06-01}, booktitle = {Proceedings of CVPR Workshop on AI for Content Creation (AICC)}, keywords = {computational video, CVPR, google, video editing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2021-Piergiovanni-UDAIV, title = {Unsupervised Discovery of Actions in Instructional Videos}, author = {AJ Piergiovanni and Anelia Angelova and Michael S. Ryoo and Irfan Essa}, url = {https://arxiv.org/abs/2106.14733 https://www.bmvc2021-virtualconference.com/assets/papers/0773.pdf}, doi = {10.48550/arXiv.2106.14733}, year = {2021}, date = {2021-06-01}, urldate = {2021-06-01}, booktitle = {British Machine Vision Conference (BMVC)}, number = {arXiv:2106.14733}, abstract = {In this paper, we address the problem of automatically discovering atomic actions in an unsupervised manner from instructional videos. Instructional videos contain complex activities and are a rich source of information for intelligent agents, such as autonomous robots or virtual assistants, which can, for example, automatically `read' the steps from an instructional video and execute them. However, videos are rarely annotated with atomic activities, their boundaries or duration. We present an unsupervised approach to learn atomic actions of structured human tasks from a variety of instructional videos. We propose a sequential stochastic autoregressive model for temporal segmentation of videos, which learns to represent and discover the sequential relationship between different atomic actions of the task, and which provides automatic and unsupervised self-labeling for videos. Our approach outperforms the state-of-the-art unsupervised methods by large margins. We will open source the code. }, keywords = {activity recognition, computational video, computer vision, google}, pubstate = {published}, tppubtype = {inproceedings} } @article{2021-Haresamudram-CPCHAR, title = {Contrastive Predictive Coding for Human Activity Recognition}, author = {Harish Haresamudram and Irfan Essa and Thomas Ploetz}, url = {https://doi.org/10.1145/3463506 https://arxiv.org/abs/2012.05333}, doi = {10.1145/3463506}, year = {2021}, date = {2021-06-01}, urldate = {2021-06-01}, booktitle = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies}, journal = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies}, volume = {5}, number = {2}, pages = {1--26}, abstract = {Feature extraction is crucial for human activity recognition (HAR) using body-worn movement sensors. Recently, learned representations have been used successfully, offering promising alternatives to manually engineered features. Our work focuses on effective use of small amounts of labeled data and the opportunistic exploitation of unlabeled data that are straightforward to collect in mobile and ubiquitous computing scenarios.
We hypothesize and demonstrate that explicitly considering the temporality of sensor data at representation level plays an important role for effective HAR in challenging scenarios. We introduce the Contrastive Predictive Coding (CPC) framework to human activity recognition, which captures the long-term temporal structure of sensor data streams. Through a range of experimental evaluations on real-life recognition tasks, we demonstrate its effectiveness for improved HAR. CPC-based pre-training is self-supervised, and the resulting learned representations can be integrated into standard activity chains. It leads to significantly improved recognition performance when only small amounts of labeled training data are available, thereby demonstrating the practical value of our approach.}, keywords = {activity recognition, IMWUT, machine learning, ubiquitous computing}, pubstate = {published}, tppubtype = {article} } @inproceedings{2021-Truong-AGTHTFIMV, title = {Automatic Generation of Two-Level Hierarchical Tutorials from Instructional Makeup Videos}, author = {Anh Truong and Peggy Chi and David Salesin and Irfan Essa and Maneesh Agrawala}, url = {https://dl.acm.org/doi/10.1145/3411764.3445721 https://research.google/pubs/pub50007/ http://anhtruong.org/makeup_breakdown/}, doi = {10.1145/3411764.3445721}, year = {2021}, date = {2021-05-01}, urldate = {2021-05-01}, booktitle = {ACM CHI Conference on Human factors in Computing Systems}, abstract = {We present a multi-modal approach for automatically generating hierarchical tutorials from instructional makeup videos. Our approach is inspired by prior research in cognitive psychology, which suggests that people mentally segment procedural tasks into event hierarchies, where coarse-grained events focus on objects while fine-grained events focus on actions. In the instructional makeup domain, we find that objects correspond to facial parts while fine-grained steps correspond to actions on those facial parts. Given an input instructional makeup video, we apply a set of heuristics that combine computer vision techniques with transcript text analysis to automatically identify the fine-level action steps and group these steps by facial part to form the coarse-level events. We provide a voice-enabled, mixed-media UI to visualize the resulting hierarchy and allow users to efficiently navigate the tutorial (e.g., skip ahead, return to previous steps) at their own pace. Users can navigate the hierarchy at both the facial-part and action-step levels using click-based interactions and voice commands. We demonstrate the effectiveness of segmentation algorithms and the resulting mixed-media UI on a variety of input makeup videos. A user study shows that users prefer following instructional makeup videos in our mixed-media format to the standard video UI and that they find our format much easier to navigate.}, keywords = {CHI, computational video, google, human-computer interaction, video summarization}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{2021-Scarafoni-PPLANBSAP, title = {PLAN-B: Predicting Likely Alternative Next Best Sequences for Action Prediction}, author = {Dan Scarafoni and Irfan Essa and Thomas Ploetz}, url = {https://arxiv.org/abs/2103.15987}, doi = {10.48550/arXiv.2103.15987}, year = {2021}, date = {2021-03-01}, urldate = {2021-03-01}, journal = {arXiv}, number = {arXiv:2103.15987}, abstract = {Action prediction focuses on anticipating actions before they happen. 
Recent works leverage probabilistic approaches to describe future uncertainties and sample future actions. However, these methods cannot easily find all alternative predictions, which are essential given the inherent unpredictability of the future, and current evaluation protocols do not measure a system's ability to find such alternatives. We re-examine action prediction in terms of its ability to predict not only the top predictions, but also top alternatives with the accuracy@k metric. In addition, we propose Choice F1: a metric inspired by F1 score which evaluates a prediction system's ability to find all plausible futures while keeping only the most probable ones. To evaluate this problem, we present a novel method, Predicting the Likely Alternative Next Best, or PLAN-B, for action prediction which automatically finds the set of most likely alternative futures. PLAN-B consists of two novel components: (i) a Choice Table which ensures that all possible futures are found, and (ii) a "Collaborative" RNN system which combines both action sequence and feature information. We demonstrate that our system outperforms state-of-the-art results on benchmark datasets. }, keywords = {activity recognition, arXiv, computer vision}, pubstate = {published}, tppubtype = {techreport} } @inproceedings{2021-Cartillier-SMBASRFEV, title = {Semantic MapNet: Building Allocentric SemanticMaps and Representations from Egocentric Views}, author = {Vincent Cartillier and Zhile Ren and Neha Jain and Stefan Lee and Irfan Essa and Dhruv Batra}, url = {https://arxiv.org/abs/2010.01191 https://vincentcartillier.github.io/smnet.html https://ojs.aaai.org/index.php/AAAI/article/view/16180/15987}, doi = {10.48550/arXiv.2010.01191}, year = {2021}, date = {2021-02-01}, urldate = {2021-02-01}, booktitle = {Proceedings of American Association of Artificial Intelligence Conference (AAAI)}, publisher = {AAAI}, abstract = {We study the task of semantic mapping -- specifically, an embodied agent (a robot or an egocentric AI assistant) is given a tour of a new environment and asked to build an allocentric top-down semantic map (`what is where?') from egocentric observations of an RGB-D camera with known pose (via localization sensors). Importantly, our goal is to build neural episodic memories and spatio-semantic representations of 3D spaces that enable the agent to easily learn subsequent tasks in the same space -- navigating to objects seen during the tour (`Find chair') or answering questions about the space (`How many chairs did you see in the house?'). Towards this goal, we present Semantic MapNet (SMNet), which consists of: (1) an Egocentric Visual Encoder that encodes each egocentric RGB-D frame, (2) a Feature Projector that projects egocentric features to appropriate locations on a floor-plan, (3) a Spatial Memory Tensor of size floor-plan length × width × feature-dims that learns to accumulate projected egocentric features, and (4) a Map Decoder that uses the memory tensor to produce semantic top-down maps. SMNet combines the strengths of (known) projective camera geometry and neural representation learning. On the task of semantic mapping in the Matterport3D dataset, SMNet significantly outperforms competitive baselines by 4.01-16.81% (absolute) on mean-IoU and 3.81-19.69% (absolute) on Boundary-F1 metrics. 
Moreover, we show how to use the spatio-semantic allocentric representations built by SMNet for the tasks of ObjectNav and Embodied Question Answering.}, keywords = {AAAI, AI, embodied agents, first-person vision}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2020-Kumar-EMDAOTNM, title = {Estimating Mass Distribution of Articulated Objects through Non-prehensile Manipulation}, author = {Niranjan Kumar and Irfan Essa and Sehoon Ha and C. Karen Liu}, url = {https://orlrworkshop.github.io/program/orlr_25.html http://arxiv.org/abs/1907.03964 https://www.kniranjankumar.com/projects/1_mass_prediction https://www.youtube.com/watch?v=o3zBdVWvWZw https://kniranjankumar.github.io/assets/pdf/Estimating_Mass_Distribution_of_Articulated_Objects_using_Non_prehensile_Manipulation.pdf}, year = {2020}, date = {2020-12-01}, urldate = {2020-12-01}, booktitle = {Neural Information Processing Systems (NeurIPS) Workshop on Object Representations for Learning and Reasoning}, organization = {NeurIPS}, abstract = {We explore the problem of estimating the mass distribution of an articulated object by an interactive robotic agent. Our method predicts the mass distribution of an object by using limited sensing and actuating capabilities of a robotic agent that is interacting with the object. We are inspired by the role of exploratory play in human infants. We take a combined approach of supervised and reinforcement learning to train an agent that learns to strategically interact with the object to estimate the object's mass distribution. Our method consists of two neural networks: (i) the policy network which decides how to interact with the object, and (ii) the predictor network that estimates the mass distribution given a history of observations and interactions. Using our method, we train a robotic arm to estimate the mass distribution of an object with moving parts (e.g. an articulated rigid body system) by pushing it on a surface with unknown friction properties. We also demonstrate how our training from simulations can be transferred to real hardware using a small amount of real-world data for fine-tuning. We use a UR10 robot to interact with 3D printed articulated chains with varying mass distributions and show that our method significantly outperforms the baseline system that uses random pushes to interact with the object.}, howpublished = {arXiv preprint arXiv:1907.03964}, keywords = {reinforcement learning, robotics}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2020-Chi-AVCFP, title = {Automatic Video Creation From a Web Page}, author = {Peggy Chi and Zheng Sun and Katrina Panovich and Irfan Essa}, url = {https://dl.acm.org/doi/abs/10.1145/3379337.3415814 https://research.google/pubs/pub49618/ https://ai.googleblog.com/2020/10/experimenting-with-automatic-video.html https://www.youtube.com/watch?v=3yFYc-Wet8k}, doi = {10.1145/3379337.3415814}, year = {2020}, date = {2020-10-01}, urldate = {2020-10-01}, booktitle = {Proceedings of the 33rd Annual ACM Symposium on User Interface Software and Technology}, pages = {279--292}, organization = {ACM CHI}, abstract = {Creating marketing videos from scratch can be challenging, especially when designing for multiple platforms with different viewing criteria. We present URL2Video, an automatic approach that converts a web page into a short video given temporal and visual constraints. URL2Video captures quality materials and design styles extracted from a web page, including fonts, colors, and layouts.
Using constraint programming, URL2Video's design engine organizes the visual assets into a sequence of shots and renders them into a video with a user-specified aspect ratio and duration. Creators can review the video composition, modify constraints, and generate video variations through a user interface. We learned the design process from designers and compared our automatically generated results with their creations through interviews and an online survey. The evaluation shows that URL2Video effectively extracted design elements from a web page and supported designers by bootstrapping the video creation process.}, keywords = {computational video, google, human-computer interaction, UIST, video editing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2020-Haresamudram-MRBSHAR, title = {Masked reconstruction based self-supervision for human activity recognition}, author = {Harish Haresamudram and Apoorva Beedu and Varun Agrawal and Patrick L Grady and Irfan Essa and Judy Hoffman and Thomas Plötz}, url = {https://dl.acm.org/doi/10.1145/3410531.3414306 https://harkash.github.io/publication/masked-reconstruction https://arxiv.org/abs/2202.12938}, doi = {10.1145/3410531.3414306}, year = {2020}, date = {2020-09-01}, urldate = {2020-09-01}, booktitle = {Proceedings of the International Symposium on Wearable Computers (ISWC)}, pages = {45--49}, abstract = {The ubiquitous availability of wearable sensing devices has rendered large-scale collection of movement data a straightforward endeavor. Yet, annotation of these data remains a challenge and as such, publicly available datasets for human activity recognition (HAR) are typically limited in size as well as in variability, which constrains HAR model training and effectiveness. We introduce masked reconstruction as a viable self-supervised pre-training objective for human activity recognition and explore its effectiveness in comparison to state-of-the-art unsupervised learning techniques. In scenarios with small labeled datasets, the pre-training results in improvements over end-to-end learning on two of the four benchmark datasets. This is promising because the pre-training objective can be integrated "as is" into state-of-the-art recognition pipelines to effectively facilitate improved model robustness and thus, ultimately, lead to better recognition performance.
}, keywords = {activity recognition, ISWC, machine learning, wearable computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2020-Lee-NDNGLGWC, title = {Neural Design Network: Graphic Layout Generation with Constraints}, author = {Hsin-Ying Lee and Lu Jiang and Irfan Essa and Madison Le and Haifeng Gong and Ming-Hsuan Yang and Weilong Yang}, url = {https://arxiv.org/abs/1912.09421 https://rdcu.be/c7sqw}, doi = {10.1007/978-3-030-58580-8_29}, year = {2020}, date = {2020-08-01}, urldate = {2020-08-01}, booktitle = {Proceedings of European Conference on Computer Vision (ECCV)}, keywords = {computer vision, content creation, ECCV, generative media, google}, pubstate = {published}, tppubtype = {inproceedings} } @patent{2020-Pantofaru-SMDCGUFPD, title = {Systems and methods for directing content generation using a first-person point-of-view device.}, author = {Caroline Pantofaru and Vinay Bettadapura and Krishna Bharat and Irfan Essa}, url = {https://patents.google.com/patent/US10721439}, year = {2020}, date = {2020-07-21}, urldate = {2020-07-01}, publisher = {(US Patent # 10721439)}, abstract = {A method for personalizing a content item using captured footage is disclosed. The method includes receiving a first video feed from a first camera, wherein the first camera is designated as a source camera for capturing an event during a first time duration. The method also includes receiving data from a second camera, and determining, based on the received data from the second camera, that an action was performed using the second camera, the action being indicative of a region of interest (ROI) of the user of the second camera occurring within a second time duration. The method further includes designating the second camera as the source camera for capturing the event during the second time duration. }, howpublished = {US Patent # 10721439}, keywords = {computer vision, google, patents}, pubstate = {published}, tppubtype = {patent} } @inproceedings{2020-Chi-IVDPSS, title = {Interactive Visual Description of a Web Page for Smart Speakers}, author = {Peggy Chi and Irfan Essa}, url = {https://research.google/pubs/pub49441/ http://www.speechinteraction.org/CHI2020/programme.html}, year = {2020}, date = {2020-05-01}, urldate = {2020-05-01}, booktitle = {Proceedings of ACM CHI Workshop, CUI@CHI: Mapping Grand Challenges for the Conversational User Interface Community}, address = {Honolulu, Hawaii, USA}, abstract = {Smart speakers are becoming ubiquitous for accessing lightweight information using speech. While these devices are powerful for question answering and service operations using voice commands, it is challenging to navigate content of rich formats–including web pages–that are consumed by mainstream computing devices. We conducted a comparative study with 12 participants that suggests and motivates the use of a narrative voice output of a web page as being easier to follow and comprehend than a conventional screen reader. We are developing a tool that automatically narrates web documents based on their visual structures with interactive prompts. We discuss the design challenges for a conversational agent to intelligently select content for a more personalized experience, where we hope to contribute to the CUI workshop and form a discussion for future research. 
}, keywords = {accessibility, CHI, google, human-computer interaction}, pubstate = {published}, tppubtype = {inproceedings} } @patent{2020-Hickson-CLNN, title = {Category learning neural networks}, author = {Steven Hickson and Anelia Angelova and Irfan Essa and Rahul Sukthankar}, url = {https://patents.google.com/patent/US10635979}, year = {2020}, date = {2020-04-28}, urldate = {2020-04-28}, publisher = {(US Patent # 10635979)}, abstract = {Methods, systems, and apparatus, including computer programs encoded on a computer storage medium, for determining a clustering of images into a plurality of semantic categories. In one aspect, a method comprises: training a categorization neural network, comprising, at each of a plurality of iterations: processing an image depicting an object using the categorization neural network to generate (i) a current prediction for whether the image depicts an object or a background region, and (ii) a current embedding of the image; determining a plurality of current cluster centers based on the current values of the categorization neural network parameters, wherein each cluster center represents a respective semantic category; and determining a gradient of an objective function that includes a classification loss and a clustering loss, wherein the clustering loss depends on a similarity between the current embedding of the image and the current cluster centers. }, howpublished = {US Patent #10635979}, keywords = {google, machine learning, patents}, pubstate = {published}, tppubtype = {patent} } @inproceedings{2020-Wijmans-DDSPN, title = {Decentralized Distributed PPO: Solving PointGoal Navigation}, author = {Erik Wijmans and Abhishek Kadian and Ari Morcos and Stefan Lee and Irfan Essa and Devi Parikh and Manolis Savva and Dhruv Batra}, url = {https://arxiv.org/abs/1911.00357 https://paperswithcode.com/paper/decentralized-distributed-ppo-solving}, year = {2020}, date = {2020-04-01}, urldate = {2020-04-01}, booktitle = {Proceedings of International Conference on Learning Representations (ICLR)}, abstract = {We present Decentralized Distributed Proximal Policy Optimization (DD-PPO), a method for distributed reinforcement learning in resource-intensive simulated environments. DD-PPO is distributed (uses multiple machines), decentralized (lacks a centralized server), and synchronous (no computation is ever stale), making it conceptually simple and easy to implement. In our experiments on training virtual robots to navigate in Habitat-Sim, DD-PPO exhibits near-linear scaling -- achieving a speedup of 107x on 128 GPUs over a serial implementation. We leverage this scaling to train an agent for 2.5 Billion steps of experience (the equivalent of 80 years of human experience) -- over 6 months of GPU-time training in under 3 days of wall-clock time with 64 GPUs. This massive-scale training not only sets the state of art on Habitat Autonomous Navigation Challenge 2019, but essentially solves the task --near-perfect autonomous navigation in an unseen environment without access to a map, directly from an RGB-D camera and a GPS+Compass sensor. Fortuitously, error vs computation exhibits a power-law-like distribution; thus, 90% of peak performance is obtained relatively early (at 100 million steps) and relatively cheaply (under 1 day with 8 GPUs). Finally, we show that the scene understanding and navigation policies learned can be transferred to other navigation tasks -- the analog of ImageNet pre-training + task-specific fine-tuning for embodied AI. 
Our model outperforms ImageNet pre-trained CNNs on these transfer tasks and can serve as a universal resource (all models and code are publicly available).}, keywords = {embodied agents, ICLR, navigation, systems for ML}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{2020-Wijmans-AVRENT, title = {Analyzing Visual Representations in Embodied Navigation Tasks}, author = {Erik Wijmans and Julian Straub and Dhruv Batra and Irfan Essa and Judy Hoffman and Ari Morcos}, url = {https://arxiv.org/abs/2003.05993 https://arxiv.org/pdf/2003.05993}, doi = {10.48550/arXiv.2003.05993}, year = {2020}, date = {2020-03-01}, urldate = {2020-03-01}, journal = {arXiv}, number = {arXiv:2003.05993}, abstract = {Recent advances in deep reinforcement learning require a large amount of training data and generally result in representations that are often over specialized to the target task. In this work, we present a methodology to study the underlying potential causes for this specialization. We use the recently proposed projection weighted Canonical Correlation Analysis (PWCCA) to measure the similarity of visual representations learned in the same environment by performing different tasks. We then leverage our proposed methodology to examine the task dependence of visual representations learned on related but distinct embodied navigation tasks. Surprisingly, we find that slight differences in task have no measurable effect on the visual representation for both SqueezeNet and ResNet architectures. We then empirically demonstrate that visual representations learned on one task can be effectively transferred to a different task.}, howpublished = {arXiv:2003.05993}, keywords = {arXiv, embodied agents, navigation}, pubstate = {published}, tppubtype = {techreport} } @patent{2019-Starner-OOIVS, title = {Object occlusion to initiate a visual search}, author = {Thad Eugene Starner and Irfan Essa and Hayes Solos Raffle and Daniel Aminzade}, url = {https://patents.google.com/patent/US10437882}, year = {2019}, date = {2019-10-01}, urldate = {2019-10-01}, publisher = {(US Patent # 10437882)}, abstract = {Methods, systems, and apparatus, including computer programs encoded on computer storage media, for video segmentation. One of the methods includes receiving a digital video; performing hierarchical graph-based video segmentation on at least one frame of the digital video to generate a boundary representation for the at least one frame; generating a vector representation from the boundary representation for the at least one frame of the digital video, wherein generating the vector representation includes generating a polygon composed of at least three vectors, wherein each vector comprises two vertices connected by a line segment, from a boundary in the boundary representation; linking the vector representation to the at least one frame of the digital video; and storing the vector representation with the at least one frame of the digital video. 
}, howpublished = {US Patent # 10437882}, note = {US Patent 10,437,882}, keywords = {computer vision, google, patents}, pubstate = {published}, tppubtype = {patent} } @inproceedings{2019-Hickson-FFLSRSNP, title = {Floors are Flat: Leveraging Semantics for Real-Time Surface Normal Prediction}, author = {Steven Hickson and Karthik Raveendran and Alireza Fathi and Kevin Murphy and Irfan Essa}, url = {https://arxiv.org/abs/1906.06792 https://openaccess.thecvf.com/content_ICCVW_2019/papers/GMDL/Hickson_Floors_are_Flat_Leveraging_Semantics_for_Real-Time_Surface_Normal_Prediction_ICCVW_2019_paper.pdf}, doi = {10.1109/ICCVW.2019.00501}, year = {2019}, date = {2019-10-01}, urldate = {2019-10-01}, booktitle = {IEEE International Conference on Computer Vision (ICCV) Workshop on Geometry Meets Deep Learning}, abstract = {We propose 4 insights that help to significantly improve the performance of deep learning models that predict surface normals and semantic labels from a single RGB image. These insights are: (1) denoise the "ground truth" surface normals in the training set to ensure consistency with the semantic labels; (2) concurrently train on a mix of real and synthetic data, instead of pretraining on synthetic and fine-tuning on real; (3) jointly predict normals and semantics using a shared model, but only backpropagate errors on pixels that have valid training labels; (4) slim down the model and use grayscale instead of color inputs. Despite the simplicity of these steps, we demonstrate consistently improved state of the art results on several datasets, using a model that runs at 12 fps on a standard mobile phone. }, howpublished = {arXiv preprint arXiv:1906.06792}, keywords = {computer vision, google, ICCV}, pubstate = {published}, tppubtype = {inproceedings} } @article{2019-Ghogawala-AITLS, title = {Artificial Intelligence for the Treatment of Lumbar Spondylolisthesis}, author = {Zoher Ghogawala and Melissa Dunbar and Irfan Essa}, url = {http://www.sciencedirect.com/science/article/pii/S1042368019300257 https://pubmed.ncbi.nlm.nih.gov/31078239/}, doi = {10.1016/j.nec.2019.02.012}, issn = {1042-3680}, year = {2019}, date = {2019-07-01}, urldate = {2019-07-01}, journal = {Neurosurgery Clinics of North America}, volume = {30}, number = {3}, pages = {383 - 389}, abstract = {Multiple registries are currently collecting patient-specific data on lumbar spondylolisthesis including outcomes data. The collection of imaging diagnostics data along with comparative outcomes data following decompression versus decompression and fusion treatments for degenerative spondylolisthesis represents an enormous opportunity for modern machine-learning analytics research. }, note = {Lumbar Spondylolisthesis}, keywords = {AI, computational health, Predictive analytics}, pubstate = {published}, tppubtype = {article} } @article{2019-Zia-NESARMUTEM, title = {Novel evaluation of surgical activity recognition models using task-based efficiency metrics}, author = {Aneeq Zia and Liheng Guo and Linlin Zhou and Irfan Essa and Anthony Jarc}, url = {https://www.ncbi.nlm.nih.gov/pubmed/31267333}, doi = {10.1007/s11548-019-02025-w}, year = {2019}, date = {2019-07-01}, urldate = {2019-07-01}, journal = {International Journal of Computer Assisted Radiology and Surgery}, abstract = {PURPOSE: Surgical task-based metrics (rather than entire procedure metrics) can be used to improve surgeon training and, ultimately, patient care through focused training interventions. 
Machine learning models to automatically recognize individual tasks or activities are needed to overcome the otherwise manual effort of video review. Traditionally, these models have been evaluated using frame-level accuracy. Here, we propose evaluating surgical activity recognition models by their effect on task-based efficiency metrics. In this way, we can determine when models have achieved adequate performance for providing surgeon feedback via metrics from individual tasks. METHODS: We propose a new CNN-LSTM model, RP-Net-V2, to recognize the 12 steps of robotic-assisted radical prostatectomies (RARP). We evaluated our model both in terms of conventional methods (e.g., Jaccard Index, task boundary accuracy) as well as novel ways, such as the accuracy of efficiency metrics computed from instrument movements and system events. RESULTS: Our proposed model achieves a Jaccard Index of 0.85 thereby outperforming previous models on RARP. Additionally, we show that metrics computed from tasks automatically identified using RP-Net-V2 correlate well with metrics from tasks labeled by clinical experts. CONCLUSION: We demonstrate that metrics-based evaluation of surgical activity recognition models is a viable approach to determine when models can be used to quantify surgical efficiencies. We believe this approach and our results illustrate the potential for fully automated, postoperative efficiency reports.}, keywords = {activity assessment, activity recognition, surgical training}, pubstate = {published}, tppubtype = {article} } @article{2019-Ghogawala-LSMRDAI, title = {Lumbar spondylolisthesis: modern registries and the development of artificial intelligence}, author = {Zoher Ghogawala and Melissa Dunbar and Irfan Essa}, doi = {10.3171/2019.2.SPINE18751}, year = {2019}, date = {2019-06-01}, urldate = {2019-06-01}, journal = {Journal of Neurosurgery: Spine (JNSPG 75th Anniversary Invited Review Article)}, volume = {30}, number = {6}, pages = {729-735}, keywords = {AI, computational health, Predictive analytics}, pubstate = {published}, tppubtype = {article} } @inproceedings{2019-Wijmans-EQAPEWPCP, title = {Embodied Question Answering in Photorealistic Environments With Point Cloud Perception}, author = {Erik Wijmans and Samyak Datta and Oleksandr Maksymets and Abhishek Das and Georgia Gkioxari and Stefan Lee and Irfan Essa and Devi Parikh and Dhruv Batra}, doi = {10.1109/CVPR.2019.00682}, year = {2019}, date = {2019-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, keywords = {computer vision, CVPR, vision & language}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2019-Alamri-AVSD, title = {Audio Visual Scene-Aware Dialog}, author = {Huda Alamri and Vincent Cartillier and Abhishek Das and Jue Wang and Anoop Cherian and Irfan Essa and Dhruv Batra and Tim K. Marks and Chiori Hori and Peter Anderson and Stefan Lee and Devi Parikh}, url = {https://openaccess.thecvf.com/content_CVPR_2019/papers/Alamri_Audio_Visual_Scene-Aware_Dialog_CVPR_2019_paper.pdf https://video-dialog.com/ https://arxiv.org/abs/1901.09107}, doi = {10.1109/CVPR.2019.00774}, year = {2019}, date = {2019-06-01}, urldate = {2019-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, abstract = {We introduce the task of scene-aware dialog. Our goal is to generate a complete and natural response to a question about a scene, given video and audio of the scene and the history of previous turns in the dialog. 
To answer successfully, agents must ground concepts from the question in the video while leveraging contextual cues from the dialog history. To benchmark this task, we introduce the Audio Visual Scene-Aware Dialog (AVSD) Dataset. For each of more than 11,000 videos of human actions from the Charades dataset, our dataset contains a dialog about the video, plus a final summary of the video by one of the dialog participants. We train several baseline systems for this task and evaluate the performance of the trained models using both qualitative and quantitative metrics. Our results indicate that models must utilize all the available inputs (video, audio, question, and dialog history) to perform best on this dataset. }, keywords = {computational video, computer vision, CVPR, embodied agents, vision & language}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2019-Drnach-DPMIEHGD, title = {A Data-Driven Predictive Model of Individual-Specific Effects of FES on Human Gait Dynamics}, author = {Luke Drnach and J. L. Allen and Irfan Essa and Lena H. Ting}, url = {https://neuromechanicslab.emory.edu/documents/publications-docs/Drnach%20et%20al%20Data%20Driven%20Gait%20Model%20ICRA%202019.pdf}, doi = {10.1109/ICRA.2019.8794304}, year = {2019}, date = {2019-05-01}, urldate = {2019-05-01}, booktitle = {Proceedings International Conference on Robotics and Automation (ICRA)}, keywords = {gait analysis, robotics}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2019-Ahsan-VJULSCVAR, title = {Video Jigsaw: Unsupervised Learning of Spatiotemporal Context for Video Action Recognition}, author = {Unaiza Ahsan and Rishi Madhok and Irfan Essa}, url = {https://ieeexplore.ieee.org/abstract/document/8659002}, doi = {10.1109/WACV.2019.00025}, issn = {1550-5790}, year = {2019}, date = {2019-01-01}, urldate = {2019-01-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, pages = {179-189}, keywords = {activity recognition, computer vision, machine learning, WACV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2019-Hickson-ECFEUEC, title = {Eyemotion: Classifying Facial Expressions in VR Using Eye-Tracking Cameras}, author = {S. Hickson and N. Dufour and A. Sud and V. Kwatra and I. Essa}, url = {https://ieeexplore.ieee.org/document/8658392 https://ai.google/research/pubs/pub46291}, doi = {10.1109/WACV.2019.00178}, issn = {1550-5790}, year = {2019}, date = {2019-01-01}, urldate = {2019-01-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, pages = {1626-1635}, abstract = {One of the main challenges of social interaction in virtual reality settings is that head-mounted displays occlude a large portion of the face, blocking facial expressions and thereby restricting social engagement cues among users. We present an algorithm to automatically infer expressions by analyzing only a partially occluded face while the user is engaged in a virtual reality experience. Specifically, we show that images of the user's eyes captured from an IR gaze-tracking camera within a VR headset are sufficient to infer a subset of facial expressions without the use of any fixed external camera. Using these inferences, we can generate dynamic avatars in real-time which function as an expressive surrogate for the user. We propose a novel data collection pipeline as well as a novel approach for increasing CNN accuracy via personalization. 
Our results show a mean accuracy of 74% (F1 of 0.73) among 5 'emotive' expressions and a mean accuracy of 70% (F1 of 0.68) among 10 distinct facial action units, outperforming human raters. }, keywords = {audio-video fusion, face & gesture, face processing, multimodal interfaces, WACV}, pubstate = {published}, tppubtype = {inproceedings} } @patent{2018-Essa-VRVS, title = {Vector representation for video segmentation}, author = {Irfan Essa and Vivek Kwatra and Matthias Grundmann}, url = {https://patents.google.com/patent/US20180350131}, year = {2018}, date = {2018-12-06}, urldate = {2018-12-01}, publisher = {(US Patent Application # 14/587,420)}, howpublished = {US Patent # US20180350131A1}, note = {US Patent Application 14/587,420}, keywords = {computer vision, google, patents}, pubstate = {published}, tppubtype = {patent} } @patent{2018-Pantofaru-SMDCGUFPD, title = {Systems and methods for directing content generation using a first-person point-of-view device}, author = {Caroline Pantofaru and Vinay Bettadapura and Krishna Bharat and Irfan Essa}, url = {https://patents.google.com/patent/US10110850}, year = {2018}, date = {2018-10-23}, urldate = {2018-10-01}, publisher = {(US Patent #10110850)}, abstract = {A method for localizing the attention of a user of a first-person point-of-view (FPPOV) device is disclosed. The method includes receiving data from an FPPOV device, the data being indicative of a first region-of-interest (ROI) of an event for a first time duration and a second ROI of the event for a second time duration. The method further includes determining that a first camera from a plurality of cameras best captures the first ROI during the first time duration, and determining that a second camera from the plurality of cameras best captures the second ROI during the second time duration. }, howpublished = {US Patent # US10110850B1}, note = {US Patent 10,110,850}, keywords = {computer vision, google, patents}, pubstate = {published}, tppubtype = {patent} } @techreport{2018-Balloch-USSRPUSDFT, title = {Unbiasing Semantic Segmentation For Robot Perception using Synthetic Data Feature Transfer}, author = {Jonathan C Balloch and Varun Agrawal and Irfan Essa and Sonia Chernova}, url = {https://doi.org/10.48550/arXiv.1809.03676}, doi = {10.48550/arXiv.1809.03676}, year = {2018}, date = {2018-09-01}, urldate = {2018-09-01}, journal = {arXiv}, number = {arXiv:1809.03676}, abstract = {Robot perception systems need to perform reliable image segmentation in real-time on noisy, raw perception data. State-of-the-art segmentation approaches use large CNN models and carefully constructed datasets; however, these models focus on accuracy at the cost of real-time inference. Furthermore, the standard semantic segmentation datasets are not large enough for training CNNs without augmentation and are not representative of noisy, uncurated robot perception data. We propose improving the performance of real-time segmentation frameworks on robot perception data by transferring features learned from synthetic segmentation data. We show that pretraining real-time segmentation architectures with synthetic segmentation data instead of ImageNet improves fine-tuning performance by reducing the bias learned in pretraining and closing the transfer gap as a result. Our experiments show that our real-time robot perception models pretrained on synthetic data outperform those pretrained on ImageNet for every scale of fine-tuning data examined.
Moreover, the degree to which synthetic pretraining outperforms ImageNet pretraining increases as the availability of robot data decreases, making our approach attractive for robotics domains where dataset collection is hard and/or expensive. }, howpublished = {arXiv:1809.03676}, keywords = {arXiv, robotics, scene understanding}, pubstate = {published}, tppubtype = {techreport} } @article{2018-Ahsan-VJULSCVAR, title = {Video Jigsaw: Unsupervised Learning of Spatiotemporal Context for Video Action Recognition}, author = {Unaiza Ahsan and Rishi Madhok and Irfan Essa}, year = {2018}, date = {2018-08-01}, journal = {arXiv}, number = {arXiv:1808.07507}, keywords = {activity recognition, computer vision, machine learning}, pubstate = {published}, tppubtype = {article} } @inproceedings{2018-Drnach-IGPFJKDWWSLDS, title = {Identifying Gait Phases from Joint Kinematics during Walking with Switched Linear Dynamical Systems*}, author = {Luke Drnach and Irfan Essa and Lena Ting}, url = {https://ieeexplore.ieee.org/document/8487216}, doi = {10.1109/BIOROB.2018.8487216}, issn = {2155-1782}, year = {2018}, date = {2018-08-01}, urldate = {2018-08-01}, booktitle = {IEEE International Conference on Biomedical Robotics and Biomechatronics (Biorob)}, pages = {1181-1186}, abstract = {Human-robot interaction (HRI) for gait rehabilitation would benefit from data-driven gait models that account for gait phases and gait dynamics. Here we address the current limitation in gait models driven by kinematic data, which do not model interlimb gait dynamics and have not been shown to precisely identify gait events. We used Switched Linear Dynamical Systems (SLDS) to model joint angle kinematic data from healthy individuals walking on a treadmill with normal gaits and with gaits perturbed by electrical stimulation. We compared the model-inferred gait phases to gait phases measured externally via a force plate. We found that SLDS models accounted for over 88% of the variation in each joint angle and labeled the joint kinematics with the correct gait phase with 84% precision on average. The transitions between hidden states matched measured gait events, with a median absolute difference of 25ms. To our knowledge, this is the first time that SLDS inferred gait phases have been validated by an external measure of gait, instead of against predefined gait phase durations. SLDS provide individual-specific representations of gait that incorporate both gait phases and gait dynamics. SLDS may be useful for developing control policies for HRI aimed at improving gait by allowing for changes in control to be precisely timed to different gait phases. }, keywords = {gait analysis, robotics}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{2018-Hickson-OCLRWWS, title = {Object category learning and retrieval with weak supervision}, author = {Steven Hickson and Anelia Angelova and Irfan Essa and Rahul Sukthankar}, url = {https://arxiv.org/abs/1801.08985 https://arxiv.org/pdf/1801.08985}, doi = {10.48550/arXiv.1801.08985}, year = {2018}, date = {2018-07-01}, urldate = {2018-07-01}, journal = {arXiv}, number = {arXiv:1801.08985}, abstract = {We consider the problem of retrieving objects from image data and learning to classify them into meaningful semantic categories with minimal supervision. To that end, we propose a fully differentiable unsupervised deep clustering approach to learn semantic classes in an end-to-end fashion without individual class labeling using only unlabeled object proposals. 
The key contributions of our work are 1) a k-means clustering objective where the clusters are learned as parameters of the network and are represented as memory units, and 2) simultaneously building a feature representation, or embedding, while learning to cluster it. This approach shows promising results on two popular computer vision datasets: on CIFAR10 for clustering objects, and on the more complex and challenging Cityscapes dataset for semantically discovering classes which visually correspond to cars, people, and bicycles. Currently, the only supervision provided is segmentation objectness masks, but this method can be extended to use an unsupervised objectness-based object generation mechanism which will make the approach completely unsupervised. }, howpublished = {arXiv:1801.08985}, keywords = {arXiv, computer vision, machine learning, object detection}, pubstate = {published}, tppubtype = {techreport} } @article{2018-Hori-EAVSDUMAVF, title = {End-to-End Audio Visual Scene-Aware Dialog using Multimodal Attention-Based Video Features}, author = {Chiori Hori and Huda Alamri and Jue Wang and Gordon Winchern and Takaaki Hori and Anoop Cherian and Tim K Marks and Vincent Cartillier and Raphael Gontijo Lopes and Abhishek Das and Irfan Essa and Dhruv Batra and Devi Parikh}, year = {2018}, date = {2018-06-01}, journal = {arXiv}, number = {arXiv:1806.08409}, keywords = {}, pubstate = {published}, tppubtype = {article} } @techreport{2018-Alamri-AVSDACD, title = {Audio Visual Scene-Aware Dialog (AVSD) Challenge at DSTC7}, author = {Huda Alamri and Vincent Cartillier and Raphael Gontijo Lopes and Abhishek Das and Jue Wang and Irfan Essa and Dhruv Batra and Devi Parikh and Anoop Cherian and Tim K Marks and Chiori Hori}, url = {https://video-dialog.com/ https://arxiv.org/abs/1806.00525}, doi = {10.48550/arXiv.1806.00525}, year = {2018}, date = {2018-06-01}, urldate = {2018-06-01}, journal = {arXiv}, number = {arXiv:1806.00525}, abstract = {Scene-aware dialog systems will be able to have conversations with users about the objects and events around them. Progress on such systems can be made by integrating state-of-the-art technologies from multiple research areas including end-to-end dialog systems, visual dialog, and video description. We introduce the Audio Visual Scene Aware Dialog (AVSD) challenge and dataset. 
In this challenge, which is one track of the 7th Dialog System Technology Challenges (DSTC7) workshop, the task is to build a system that generates responses in a dialog about an input video. }, howpublished = {arXiv:1806.00525}, keywords = {arXiv, embodied agents, multimedia, vision & language}, pubstate = {published}, tppubtype = {techreport} } @article{2018-Zia-SARRRPUDL, title = {Surgical Activity Recognition in Robot-Assisted Radical Prostatectomy using Deep Learning}, author = {Aneeq Zia and Andrew Hung and Irfan Essa and Anthony Jarc}, year = {2018}, date = {2018-06-01}, journal = {arXiv}, number = {arXiv:1806.00466}, keywords = {}, pubstate = {published}, tppubtype = {article} } @article{2018-Zia-ASSART, title = {Automated surgical skill assessment in RMIS training}, author = {Aneeq Zia and Irfan Essa}, url = {https://link.springer.com/article/10.1007/s11548-018-1735-5}, year = {2018}, date = {2018-03-01}, journal = {International Journal of Computer Assisted Radiology and Surgery}, volume = {13}, number = {5}, pages = {731--739}, publisher = {Springer}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{2018-Uzun-RRCBLDS, title = {rtCaptcha: A Real-Time CAPTCHA Based Liveness Detection System}, author = {Erkam Uzun and Simon Pak Ho Chung and Irfan Essa and Wenke Lee}, year = {2018}, date = {2018-03-01}, booktitle = {Network and Distributed System Security Symposium (NDSS)}, keywords = {information security}, pubstate = {published}, tppubtype = {inproceedings} } @patent{2018-Grundmann-CCMERSDCSDVS, title = {Cascaded camera motion estimation, rolling shutter detection, and camera shake detection for video stabilization}, author = {Matthias Grundmann and Vivek Kwatra and Irfan Essa}, url = {https://patents.google.com/patent/US9888180}, year = {2018}, date = {2018-02-06}, urldate = {2018-02-01}, publisher = {(US Patent #9888180)}, howpublished = {US Patent # US9888180}, note = {US Patent 9,888,180}, keywords = {computer vision, google, patents}, pubstate = {published}, tppubtype = {patent} } @article{2018-Zia-VAMAASSA, title = {Video and accelerometer-based motion analysis for automated surgical skills assessment}, author = {Aneeq Zia and Yachna Sharma and Vinay Bettadapura and Eric L Sarin and Irfan Essa}, url = {https://link.springer.com/article/10.1007/s11548-018-1704-z}, doi = {10.1007/s11548-018-1704-z}, year = {2018}, date = {2018-01-01}, urldate = {2018-01-01}, journal = {International Journal of Computer Assisted Radiology and Surgery}, volume = {13}, number = {3}, pages = {443--455}, publisher = {Springer}, keywords = {activity assessment, activity recognition, IJCARS, surgical training}, pubstate = {published}, tppubtype = {article} } @article{2018-Castro-LDLFODV, title = {Let's Dance: Learning From Online Dance Videos}, author = {Daniel Castro and Steven Hickson and Patsorn Sangkloy and Bhavishya Mittal and Sean Dai and James Hays and Irfan Essa}, year = {2018}, date = {2018-01-01}, journal = {arXiv}, number = {arXiv:1801.07388}, keywords = {}, pubstate = {published}, tppubtype = {article} } @article{2018-Ahsan-DSARFVUGAN, title = {DiscrimNet: Semi-Supervised Action Recognition from Videos using Generative Adversarial Networks}, author = {Unaiza Ahsan and Chen Sun and Irfan Essa}, year = {2018}, date = {2018-01-01}, journal = {arXiv}, number = {arXiv:1801.07230}, keywords = {activity recognition, computer vision, machine learning}, pubstate = {published}, tppubtype = {article} } @inproceedings{2017-Shaban-OLSS, title = {One-Shot Learning for 
Semantic Segmentation}, author = {Amirreza Shaban and Shray Bansal and Zhen Liu and Irfan Essa and Byron Boots}, url = {http://www.bmva.org/bmvc/2017/papers/paper167/index.html}, doi = {10.5244/C.31.167}, year = {2017}, date = {2017-09-01}, booktitle = {British Machine Vision Conference (BMVC)}, keywords = {image segmentation, one-shot learning, semantic segmentation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2017-Zia-VAMAASSA, title = {Video and Accelerometer-Based Motion Analysis for Automated Surgical Skills Assessment}, author = {Aneeq Zia and Yachna Sharma and Vinay Bettadapura and Eric Sarin and Irfan Essa}, year = {2017}, date = {2017-06-01}, urldate = {2017-06-01}, booktitle = {Information Processing in Computer-Assisted Interventions (IPCAI)}, keywords = {activity assessment, activity recognition, surgical training}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2017-Deeb-Swihart-SELLCSCI, title = {Selfie-Presentation in Everyday Life: A Large-Scale Characterization of Selfie Contexts on Instagram}, author = {Julia Deeb-Swihart and Christopher Polack and Eric Gilbert and Irfan Essa}, year = {2017}, date = {2017-05-01}, booktitle = {International AAAI Conference on Web and Social Media (ICWSM)}, organization = {AAAI}, keywords = {social computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2017-Ahsan-TUVAIISSE, title = {Towards Using Visual Attributes to Infer Image Sentiment Of Social Events}, author = {Unaiza Ahsan and Munmun De Choudhury and Irfan Essa}, url = {https://ieeexplore.ieee.org/abstract/document/7966013}, doi = {10.1109/IJCNN.2017.7966013}, year = {2017}, date = {2017-05-01}, urldate = {2017-05-01}, booktitle = {Proceedings of The International Joint Conference on Neural Networks}, publisher = {International Neural Network Society}, address = {Anchorage, Alaska, US}, abstract = {Widespread and pervasive adoption of smartphones has led to instant sharing of photographs that capture events ranging from mundane to life-altering happenings. We propose to capture sentiment information of such social event images leveraging their visual content. Our method extracts an intermediate visual representation of social event images based on the visual attributes that occur in the images going beyond sentiment-specific attributes. We map the top predicted attributes to sentiments and extract the dominant emotion associated with a picture of a social event. Unlike recent approaches, our method generalizes to a variety of social events and even to unseen events, which are not available at training time. We demonstrate the effectiveness of our approach on a challenging social event image dataset and our method outperforms state-of-the-art approaches for classifying complex event images into sentiments. }, keywords = {computational journalism, computer vision, IJNN, machine learning}, pubstate = {published}, tppubtype = {inproceedings} } @patent{2017-Essa-SAMFUMFTPEIDS, title = {System and method for utilizing motion fields to predict evolution in dynamic scenes}, author = {Irfan Essa and Matthias Grundmann and Jessica Hodgins and Kihwan Kim and Iain Matthews and Ariel Shamir}, url = {https://patents.google.com/patent/US9600760}, year = {2017}, date = {2017-03-21}, abstract = {Described herein are methods, systems, apparatuses and products for utilizing motion fields to predict evolution in dynamic scenes. 
One aspect provides for accessing active object position data including positioning information of a plurality of individual active objects; extracting a plurality of individual active object motions from the active object position data; constructing a motion field using the plurality of individual active object motions; and using the motion field to predict one or more points of convergence at one or more spatial locations that active objects are proceeding towards at a future point in time. Other embodiments are disclosed. }, howpublished = {US Patent #US9600760}, keywords = {computer vision, patents, sports visualization}, pubstate = {published}, tppubtype = {patent} } @inproceedings{2017-Ahsan-CERFIWTE, title = {Complex Event Recognition from Images with Few Training Examples}, author = {Unaiza Ahsan and Chen Sun and James Hays and Irfan Essa}, url = {https://arxiv.org/abs/1701.04769 https://www.computer.org/csdl/proceedings-article/wacv/2017/07926663/12OmNzZEAzy}, doi = {10.1109/WACV.2017.80}, year = {2017}, date = {2017-03-01}, urldate = {2017-03-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, abstract = {We propose to leverage concept-level representations for complex event recognition in photographs given limited training examples. We introduce a novel framework to discover event concept attributes from the web and use that to extract semantic features from images and classify them into social event categories with few training examples. Discovered concepts include a variety of objects, scenes, actions and event sub-types, leading to a discriminative and compact representation for event images. Web images are obtained for each discovered event concept and we use (pretrained) CNN features to train concept classifiers. Extensive experiments on challenging event datasets demonstrate that our proposed method outperforms several baselines using deep CNN features directly in classifying images into events with limited training examples. We also demonstrate that our method achieves the best overall accuracy on a dataset with unseen event categories using a single training example. }, keywords = {activity recognition, computer vision, machine learning, WACV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2017-Thomaz-COADEA, title = {Challenges and Opportunities in Automated Detection of Eating Activity}, author = {Edison Thomaz and Irfan Essa and Gregory Abowd}, url = {https://link.springer.com/chapter/10.1007/978-3-319-51394-2_9}, doi = {10.1007/978-3-319-51394-2_9}, year = {2017}, date = {2017-01-01}, urldate = {2017-01-01}, booktitle = {Mobile Health}, pages = {151--174}, publisher = {Springer}, abstract = {Motivated by applications in nutritional epidemiology and food journaling, computing researchers have proposed numerous techniques for automating dietary monitoring over the years. Although progress has been made, a truly practical system that can automatically recognize what people eat in real-world settings remains elusive. Eating detection is a foundational element of automated dietary monitoring (ADM) since automatically recognizing when a person is eating is required before identifying what and how much is being consumed. 
Additionally, eating detection can serve as the basis for new types of dietary self-monitoring practices such as semi-automated food journaling. This chapter discusses the problem of automated eating detection and presents a variety of practical techniques for detecting eating activities in real-world settings. These techniques center on three sensing modalities: first-person images taken with wearable cameras, ambient sounds, and on-body inertial sensors [34–37]. The chapter begins with an analysis of how first-person images reflecting everyday experiences can be used to identify eating moments using two approaches: human computation and convolutional neural networks. Next, we present an analysis showing how certain sounds associated with eating can be recognized and used to infer eating activities. Finally, we introduce a method for detecting eating moments with on-body inertial sensors placed on the wrist. }, keywords = {activity recognition, computational health, ubiquitous computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2017-Thomaz-ESABEDWISW, title = {Exploring Symmetric and Asymmetric Bimanual Eating Detection with Inertial Sensors on the Wrist}, author = {Edison Thomaz and Abdelkareem Bedri and Temiloluwa Prioleau and Irfan Essa and Gregory Abowd}, doi = {10.1145/3089341.3089345}, year = {2017}, date = {2017-01-01}, urldate = {2017-01-01}, booktitle = {Proceedings of the 1st Workshop on Digital Biomarkers}, pages = {21--26}, organization = {ACM}, keywords = {activity recognition, ubiquitous computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2016-Zia-FDASTD, title = {Fine-tuning Deep Architectures for Surgical Tool Detection}, author = {Aneeq Zia and Daniel Castro and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/deepm2cai/ https://www.cc.gatech.edu/cpl/projects/deepm2cai/paper.pdf}, year = {2016}, date = {2016-10-01}, urldate = {2016-10-01}, booktitle = {Workshop and Challenges on Modeling and Monitoring of Computer Assisted Interventions (M2CAI), Held in Conjunction with International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)}, address = {Athens, Greece}, abstract = {Understanding surgical workflow has been a key concern of the medical research community. One of the main advantages of surgical workflow detection is real time operating room (OR) scheduling. For hospitals, each minute of OR time is important in order to reduce cost and increase patient throughput. Traditional approaches in this field generally tackle the video analysis using hand crafted video features to facilitate the tool detection. Recently, Twinanda et al. presented a CNN architecture ’EndoNet’ which outperformed previous methods for both surgical tool detection and surgical phase detection. Given the recent success of these networks, we present a study of various architectures coupled with a submission to the M2CAI Surgical Tool Detection challenge. We achieved a top-3 result for the M2CAI competition with a mAP of 37.6. 
}, keywords = {activity assessment, computer vision, MICCAI, surgical training}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2016-Bettadapura-LCCGBH, title = {Leveraging Contextual Cues for Generating Basketball Highlights}, author = {Vinay Bettadapura and Caroline Pantofaru and Irfan Essa}, url = {https://dl.acm.org/doi/10.1145/2964284.2964286 http://www.vbettadapura.com/highlights/basketball/index.htm}, doi = {10.1145/2964284.2964286}, year = {2016}, date = {2016-10-01}, urldate = {2016-10-01}, booktitle = {ACM International Conference on Multimedia (ACM-MM)}, organization = {ACM}, abstract = {The massive growth of sports videos has resulted in a need for automatic generation of sports highlights that are comparable in quality to the hand-edited highlights produced by broadcasters such as ESPN. Unlike previous works that mostly use audio-visual cues derived from the video, we propose an approach that additionally leverages contextual cues derived from the environment that the game is being played in. The contextual cues provide information about the excitement levels in the game, which can be ranked and selected to automatically produce high-quality basketball highlights. We introduce a new dataset of 25 NCAA games along with their play-by-play stats and the ground-truth excitement data for each basket. We explore the informativeness of five different cues derived from the video and from the environment through user studies. Our experiments show that for our study participants, the highlights produced by our system are comparable to the ones produced by ESPN for the same games.}, keywords = {ACM, ACMMM, activity recognition, computational video, computer vision, sports visualization, video summarization}, pubstate = {published}, tppubtype = {inproceedings} } @article{2016-Zia-AVASSTEMS, title = {Automated video-based assessment of surgical skills for training and evaluation in medical schools}, author = {Aneeq Zia and Yachna Sharma and Vinay Bettadapura and Eric Sarin and Thomas Ploetz and Mark Clements and Irfan Essa}, url = {http://link.springer.com/article/10.1007/s11548-016-1468-2 https://pubmed.ncbi.nlm.nih.gov/27567917/}, doi = {10.1007/s11548-016-1468-2}, year = {2016}, date = {2016-09-01}, urldate = {2016-09-01}, journal = {International Journal of Computer Assisted Radiology and Surgery}, volume = {11}, number = {9}, pages = {1623--1636}, publisher = {Springer Berlin Heidelberg}, abstract = {Routine evaluation of basic surgical skills in medical schools requires considerable time and effort from supervising faculty. For each surgical trainee, a supervisor has to observe the trainees in person. Alternatively, supervisors may use training videos, which reduces some of the logistical overhead. All these approaches however are still incredibly time consuming and involve human bias. In this paper, we present an automated system for surgical skills assessment by analyzing video data of surgical activities. 
}, keywords = {activity assessment, computational health, IJCARS, surgical training}, pubstate = {published}, tppubtype = {article} } @inproceedings{2016-Castro-DPHFEVV, title = {Discovering Picturesque Highlights from Egocentric Vacation Video}, author = {Daniel Castro and Vinay Bettadapura and Irfan Essa}, url = {https://ieeexplore.ieee.org/document/7477707 http://www.cc.gatech.edu/cpl/projects/egocentrichighlights/ https://youtu.be/lIONi21y-mk}, doi = {10.1109/WACV.2016.7477707}, year = {2016}, date = {2016-03-01}, urldate = {2016-03-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, abstract = {We present an approach for identifying picturesque highlights from large amounts of egocentric video data. Given a set of egocentric videos captured over the course of a vacation, our method analyzes the videos and looks for images that have good picturesque and artistic properties. We introduce novel techniques to automatically determine aesthetic features such as composition, symmetry and color vibrancy in egocentric videos and rank the video frames based on their photographic qualities to generate highlights. Our approach also uses contextual information such as GPS, when available, to assess the relative importance of each geographic location where the vacation videos were shot. Furthermore, we specifically leverage the properties of egocentric videos to improve our highlight detection. We demonstrate results on a new egocentric vacation dataset which includes 26.5 hours of videos taken over a 14-day vacation that spans many famous tourist destinations and also provide results from a user study to assess our results. }, keywords = {computational photography, computational video, computer vision, WACV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Zia-AASSUFA, title = {Automated Assessment of Surgical Skills Using Frequency Analysis}, author = {Aneeq Zia and Yachna Sharma and Vinay Bettadapura and Eric Sarin and Mark Clements and Irfan Essa}, url = {https://link.springer.com/chapter/10.1007/978-3-319-24553-9_53 https://rdcu.be/c7CEF}, doi = {10.1007/978-3-319-24553-9_53}, year = {2015}, date = {2015-10-01}, urldate = {2015-10-01}, booktitle = {International Conference on Medical Image Computing and Computer Assisted Interventions (MICCAI)}, abstract = {We present an automated framework for visual assessment of the expertise level of surgeons using the OSATS (Objective Structured Assessment of Technical Skills) criteria. Video analysis techniques for extracting motion quality via frequency coefficients are introduced. The framework is tested on videos of medical students with different expertise levels performing basic surgical tasks in a surgical training lab setting. We demonstrate that transforming the sequential time data into frequency components effectively extracts the useful information differentiating between different skill levels of the surgeons. The results show significant performance improvements using DFT and DCT coefficients over known state-of-the-art techniques. 
}, keywords = {activity assessment, computational health, surgical training}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Thomaz-PAREMWWIS, title = {A Practical Approach for Recognizing Eating Moments with Wrist-Mounted Inertial Sensing}, author = {Edison Thomaz and Irfan Essa and Gregory Abowd}, url = {https://dl.acm.org/doi/10.1145/2750858.2807545}, doi = {10.1145/2750858.2807545}, year = {2015}, date = {2015-09-01}, urldate = {2015-09-01}, booktitle = {ACM International Conference on Ubiquitous Computing (UBICOMP)}, abstract = {Recognizing when eating activities take place is one of the key challenges in automated food intake monitoring. Despite progress over the years, most proposed approaches have been largely impractical for everyday usage, requiring multiple on-body sensors or specialized devices such as neck collars for swallow detection. In this paper, we describe the implementation and evaluation of an approach for inferring eating moments based on 3-axis accelerometry collected with a popular off-the-shelf smartwatch. Trained with data collected in a semi-controlled laboratory setting with 20 subjects, our system recognized eating moments in two free-living condition studies (7 participants, 1 day; 1 participant, 31 days), with F-scores of 76.1% (66.7% Precision, 88.8% Recall), and 71.3% (65.2% Precision, 78.6% Recall). This work represents a contribution towards the implementation of a practical, automated system for everyday food intake monitoring, with applicability in areas ranging from health research to food journaling. }, keywords = {activity recognition, computational health, machine learning, Ubicomp, ubiquitous computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Castro-PDAFEIUDL, title = {Predicting Daily Activities from Egocentric Images Using Deep Learning}, author = {Daniel Castro and Steven Hickson and Vinay Bettadapura and Edison Thomaz and Gregory Abowd and Henrik Christensen and Irfan Essa}, url = {https://dl.acm.org/doi/10.1145/2802083.2808398 https://arxiv.org/abs/1510.01576 http://www.cc.gatech.edu/cpl/projects/dailyactivities/ }, doi = {10.1145/2802083.2808398}, year = {2015}, date = {2015-09-01}, urldate = {2015-09-01}, booktitle = {Proceedings of International Symposium on Wearable Computers (ISWC)}, abstract = {We present a method to analyze images taken from a passive egocentric wearable camera along with contextual information, such as time and day of the week, to learn and predict the everyday activities of an individual. We collected a dataset of 40,103 egocentric images over 6 months with 19 activity classes and demonstrate the benefit of state-of-the-art deep learning techniques for learning and predicting daily activities. Classification is conducted using a Convolutional Neural Network (CNN) with a classification method we introduce called a late fusion ensemble. This late fusion ensemble incorporates relevant contextual information and increases our classification accuracy. Our technique achieves an overall accuracy of 83.07% in predicting a person's activity across the 19 activity classes. 
We also demonstrate some promising results from two additional users by fine-tuning the classifier with one day of training data.}, keywords = {activity recognition, computer vision, ISWC, machine learning, wearable computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Thomaz-IMEARWSFASFS, title = {Inferring Meal Eating Activities in Real World Settings from Ambient Sounds: A Feasibility Study}, author = {Edison Thomaz and Cheng Zhang and Irfan Essa and Gregory Abowd}, url = {https://dl.acm.org/doi/10.1145/2678025.2701405}, doi = {10.1145/2678025.2701405}, year = {2015}, date = {2015-05-01}, urldate = {2015-05-01}, booktitle = {ACM Conference on Intelligent User Interfaces (IUI)}, abstract = {Dietary self-monitoring has been shown to be an effective method for weight-loss, but it remains an onerous task despite recent advances in food journaling systems. Semi-automated food journaling can reduce the effort of logging, but often requires that eating activities be detected automatically. In this work we describe results from a feasibility study conducted in-the-wild where eating activities were inferred from ambient sounds captured with a wrist-mounted device; twenty participants wore the device during one day for an average of 5 hours while performing normal everyday activities. Our system was able to identify meal eating with an F-score of 79.8% in a person-dependent evaluation, and with 86.6% accuracy in a person-independent evaluation. Our approach is intended to be practical, leveraging off-the-shelf devices with audio sensing capabilities in contrast to systems for automated dietary assessment based on specialized sensors.}, keywords = {ACM, activity recognition, AI, awards, behavioral imaging, best paper award, computational health, IUI, machine learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Bettadapura-LCSAFRR, title = {Leveraging Context to Support Automated Food Recognition in Restaurants}, author = {Vinay Bettadapura and Edison Thomaz and Aman Parnami and Gregory Abowd and Irfan Essa}, url = {http://www.vbettadapura.com/egocentric/food/}, doi = {10.1109/WACV.2015.83}, year = {2015}, date = {2015-01-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, publisher = {IEEE Computer Society}, keywords = {computer vision, WACV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Hickson-SILLHS, title = {Semantic Instance Labeling Leveraging Hierarchical Segmentation}, author = {Steven Hickson and Irfan Essa and Henrik Christensen}, doi = {10.1109/WACV.2015.147}, year = {2015}, date = {2015-01-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, publisher = {IEEE Computer Society}, keywords = {computer vision, WACV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Raza-FTCOBUSL, title = {Finding Temporally Consistent Occlusion Boundaries using Scene Layout}, author = {Syed Hussain Raza and Ahmad Humayun and Matthias Grundmann and David Anderson and Irfan Essa}, doi = {10.1109/WACV.2015.141}, year = {2015}, date = {2015-01-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, publisher = {IEEE Computer Society}, keywords = {computer vision, WACV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2015-Bettadapura-EFLUFPD, title = {Egocentric Field-of-View Localization Using First-Person Point-of-View Devices}, author = {Vinay Bettadapura and Irfan Essa and 
Caroline Pantofaru}, url = {https://ieeexplore.ieee.org/document/7045943 http://www.vbettadapura.com/egocentric/localization/}, doi = {10.1109/WACV.2015.89}, year = {2015}, date = {2015-01-01}, urldate = {2015-01-01}, booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)}, publisher = {IEEE Computer Society}, abstract = {We present a technique that uses images, videos and sensor data taken from first-person point-of-view devices to perform egocentric field-of-view (FOV) localization. We define egocentric FOV localization as capturing the visual information from a person's field-of-view in a given environment and transferring this information onto a reference corpus of images and videos of the same space, hence determining what a person is attending to. Our method matches images and video taken from the first-person perspective with the reference corpus and refines the results using the first-person's head orientation information obtained using the device sensors. We demonstrate single and multi-user egocentric FOV localization in different indoor and outdoor environments with applications in augmented reality, event understanding and studying social interactions. }, keywords = {awards, best paper award, computer vision, WACV, wearable computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Bidwell-MCVAUMHTFCDSC, title = {Measuring child visual attention using markerless head tracking from color and depth sensing cameras}, author = {Jonathan Bidwell and Irfan Essa and Agata Rozga and Gregory Abowd}, url = {https://dl.acm.org/doi/10.1145/2663204.2663235 http://icmi.acm.org/2014/}, doi = {10.1145/2663204.2663235}, year = {2014}, date = {2014-11-01}, urldate = {2014-11-01}, booktitle = {Proceedings of International Conference on Multimodal Interfaces (ICMI)}, abstract = {A child's failure to respond to his or her name being called is an early warning sign for autism and response to name is currently assessed as a part of standard autism screening and diagnostic tools. In this paper, we explore markerless child head tracking as an unobtrusive approach for automatically predicting child response to name. Head turns are used as a proxy for visual attention. We analyzed 50 recorded response to name sessions with the goal of predicting if children, ages 15 to 30 months, responded to name calls by turning to look at an examiner within a defined time interval. The child's head turn angles and hand-annotated child name call intervals were extracted from each session. Human-assisted tracking was employed using an overhead Kinect camera, and automated tracking was later employed using an additional forward-facing camera as a proof-of-concept. We explore two distinct analytical approaches for predicting child responses, one relying on a rule-based approach and another on random forest classification. In addition, we derive child response latency as a new measurement that could provide researchers and clinicians with finer grain quantitative information currently unavailable in the field due to human limitations. Finally, we reflect on steps for adapting our system to work in less constrained natural settings. 
}, keywords = {autism, behavioral imaging, computer vision, ICMI}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Ahsan-TSVFSM, title = {Towards Story Visualization from Social Multimedia}, author = {Unaiza Ahsan and Irfan Essa}, url = {http://compute-cuj.org/cj-2014/cj2014_session5_paper2.pdf http://symposium2014.computation-and-journalism.com/}, year = {2014}, date = {2014-10-01}, urldate = {2014-10-01}, booktitle = {Proceedings of Symposium on Computation and Journalism}, keywords = {computational journalism, computational photography}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Sharma-VBAOUSMT, title = {Video Based Assessment of OSATS Using Sequential Motion Textures}, author = {Yachna Sharma and Vinay Bettadapura and Thomas Ploetz and Nils Hammerla and Sebastian Mellor and Roisin McNaney and Patrick Olivier and Sandeep Deshmukh and Andrew Mccaskie and Irfan Essa}, url = {https://smartech.gatech.edu/bitstream/handle/1853/53651/2014-Sharma-VBAOUSMT.pdf https://www.semanticscholar.org/paper/Video-Based-Assessment-of-OSATS-Using-Sequential-Sharma-Bettadapura/1dde770faa24d4e04306ca6fb85e76dc78876c49}, year = {2014}, date = {2014-09-01}, urldate = {2014-09-01}, booktitle = {Proceedings of Workshop on Modeling and Monitoring of Computer Assisted Interventions (M2CAI)}, abstract = {A fully automated framework for video-based surgical skill assessment is presented that incorporates the sequential and qualitative aspects of surgical motion in a data-driven manner. The Objective Structured Assessment of Technical Skills (OSATS) assessment is replicated, which provides both an overall and in-detail evaluation of basic suturing skills required for surgeons. Video analysis techniques are introduced that incorporate sequential motion aspects into motion textures. Significant performance improvement over standard bag-of-words and motion analysis approaches is demonstrated. The framework is evaluated in a case study that involved medical students with varying levels of expertise performing basic surgical tasks in a surgical training lab setting. }, keywords = {activity assessment, awards, best paper award, computer vision, medical imaging, surgical training}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Raza-DEFVUGCOB, title = {Depth Extraction from Videos Using Geometric Context and Occlusion Boundaries}, author = {Syed Hussain Raza and Omar Javed and Aveek Das and Harpreet Sawhney and Hui Cheng and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/videodepth/}, year = {2014}, date = {2014-09-01}, booktitle = {British Machine Vision Conference (BMVC)}, address = {Nottingham, UK}, keywords = {depth extraction, video segmentation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Ahsan-CSEIUKCCA, title = {Clustering Social Event Images Using Kernel Canonical Correlation Analysis}, author = {Unaiza Ahsan and Irfan Essa}, url = {https://openaccess.thecvf.com/content_cvpr_workshops_2014/W20/papers/Ahsan_Clustering_Social_Event_2014_CVPR_paper.pdf https://smartech.gatech.edu/handle/1853/53656}, year = {2014}, date = {2014-06-01}, urldate = {2014-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR) Workshop on Women in Computing (WiC)}, abstract = {Sharing user experiences in the form of photographs, tweets, text, audio and/or video has become commonplace in social networking websites. 
Browsing through large collections of social multimedia remains a cumbersome task. It requires a user to initiate a textual search query and manually go through a list of resulting images to find relevant information. We propose an automatic clustering algorithm, which, given a large collection of images, groups them into clusters of different events using the image features and related metadata. We formulate this problem as a kernel canonical correlation clustering problem in which data samples from different modalities or ‘views’ are projected to a space where correlations between the samples’ projections are maximized. Our approach enables us to learn a semantic representation of potentially uncorrelated feature sets and this representation is clustered to give unique social events. Furthermore, we leverage the rich information associated with each uploaded image (such as usernames, dates/timestamps, etc.) and empirically determine which combination of feature sets yields the best clustering score for a dataset of 100,000 images. }, keywords = {activity recognition, computer vision, CVPR, machine learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Hickson-EHGSRV, title = {Efficient Hierarchical Graph-Based Segmentation of RGBD Videos}, author = {Steven Hickson and Stan Birchfield and Irfan Essa and Henrik Christensen}, url = {http://www.cc.gatech.edu/cpl/projects/4dseg}, year = {2014}, date = {2014-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, organization = {IEEE Computer Society}, keywords = {computational video, computer vision, CVPR, video segmentation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Bidwell-APCRNFAV, title = {Automated Prediction of a Child's Response to Name from Audio and Video}, author = {Jonathan Bidwell and Agata Rozga and J. Kim and H. Rao and Mark Clements and Irfan Essa and Gregory Abowd}, url = {https://imfar.confex.com/imfar/2014/webprogram/Paper16999.html https://www.researchgate.net/publication/268143304_Automated_Prediction_of_a_Child's_Response_to_Name_from_Audio_and_Video}, year = {2014}, date = {2014-05-01}, urldate = {2014-05-01}, booktitle = {Proceedings of Annual Conference of the International Society of Autism Research}, organization = {IMFAR}, abstract = {Evidence has shown that a child’s failure to respond to name is an early warning sign for autism and is measured as a part of standard assessments, e.g., ADOS [1,2]. Objectives: Build a fully automated system for measuring a child’s response to his or her name being called given video and recorded audio during a social interaction. Here our initial goal is to enable this measurement in a naturalistic setting with the long term goal of eventually obtaining finer grain behavior measurements such as child response time latency between a name call and a response. Methods: We recorded 40 social interactions between an examiner and children (ages 15-24 months). 6 of our 40 child participants showed signs of developmental delay based on standardized parent report measures (M-CHAT, CSBS-ITC, CBCL language development survey). The child sat at a table with a toy to play with. The examiner wore a lapel microphone and called the child’s name up to 3 times while standing to the right and slightly behind the child. These interactions were recorded with two cameras that we used in conjunction with the examiner’s audio for predicting when the child responded. 
Name calls were measured by 1) detecting when an examiner called the child’s name and 2) evaluating whether the child turned to make eye contact with the examiner. Examiner name calls were detected using a speech detection algorithm. Meanwhile, the child’s head turns were tracked using a pair of cameras which consisted of an overhead Kinect color and depth camera and a front-facing color camera. These speech and head turn measurements were used to train a binary classifier for automatically predicting if and when a child responds to his or her name being called. The result is a system for predicting the child’s response to his or her name being called from automatically recorded audio and video of the session. Results: The system was evaluated against human coding of the child’s response to name from video. If the automated prediction fell within +/- 1 second of the human coded response then we recorded a match. Across our 40 sessions we had 56 name calls, 35 responses and 5 children that did not respond to name. Our software correctly predicted children’s response to name with a precision of 90%, recall of 85%.}, keywords = {autism, behavioral imaging, computational health}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2014-Sharma-ASOPFV, title = {Automated Surgical OSATS Prediction from Videos}, author = {Yachna Sharma and Thomas Ploetz and Nils Hammerla and Sebastian Mellor and Roisin McNaney and Patrick Olivier and Sandeep Deshmukh and Andrew McCaskie and Irfan Essa}, year = {2014}, date = {2014-04-01}, booktitle = {Proceedings of IEEE International Symposium on Biomedical Imaging}, address = {Beijing, CHINA}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2013-Senator-DITRCDCUA, title = {Detecting insider threats in a real corporate database of computer usage activity}, author = {Ted E. Senator and Henry G. Goldberg and Alex Memory and William T. Young and Brad Rees and Robert Pierce and Daniel Huang and Matthew Reardon and David A. Bader and Edmond Chow and Irfan Essa and Joshua Jones and Vinay Bettadapura and Duen Horng Chau and Oded Green and Oguz Kaya and Anita Zakrzewska and Erica Briscoe and Rudolph IV L. Mappus and Robert McColl and Lora Weiss and Thomas G. 
Dietterich and Alan Fern and Weng--Keen Wong and Shubhomoy Das and Andrew Emmott and Jed Irvine and Jay-Yoon Lee and Danai Koutra and Christos Faloutsos and Daniel Corkill and Lisa Friedland and Amanda Gentzel and David Jensen}, url = {http://doi.acm.org/10.1145/2487575.2488213}, doi = {10.1145/2487575.2488213}, isbn = {978-1-4503-2174-7}, year = {2013}, date = {2013-09-01}, booktitle = {Proceedings of the 19th ACM SIGKDD international conference on Knowledge discovery and data mining}, pages = {1393--1401}, publisher = {ACM}, address = {Chicago, Illinois, USA}, series = {KDD '13}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2013-Thomaz-TAAPCWREBWWC, title = {Technological Approaches for Addressing Privacy Concerns when Recognizing Eating Behaviors with Wearable Cameras.}, author = {Edison Thomaz and Aman Parnami and Jonathan Bidwell and Irfan Essa and Gregory Abowd}, doi = {10.1145/2493432.2493509}, year = {2013}, date = {2013-09-01}, urldate = {2013-09-01}, booktitle = {ACM International Joint Conference on Pervasive and Ubiquitous Computing (UBICOMP)}, keywords = {activity recognition, computational health, privacy, Ubicomp, ubiquitous computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2013-Rehg-DCSB, title = {Decoding Children's Social Behavior}, author = {James Rehg and Gregory Abowd and Agata Rozga and Mario Romero and Mark Clements and Stan Sclaroff and Irfan Essa and Opal Ousley and Yin Li and Chanho Kim and Hrishikesh Rao and Jonathan Kim and Liliana Lo Presti and Jianming Zhang and Denis Lantsman and Jonathan Bidwell and Zhefan Ye}, url = {https://ieeexplore.ieee.org/document/6619282 http://www.cbi.gatech.edu/mmdb/ }, doi = {10.1109/CVPR.2013.438}, isbn = {1063-6919}, year = {2013}, date = {2013-06-01}, urldate = {2013-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, organization = {IEEE Computer Society}, abstract = {We introduce a new problem domain for activity recognition: the analysis of children's social and communicative behaviors based on video and audio data. We specifically target interactions between children aged 1-2 years and an adult. Such interactions arise naturally in the diagnosis and treatment of developmental disorders such as autism. We introduce a new publicly-available dataset containing over 160 sessions of a 3-5 minute child-adult interaction. In each session, the adult examiner followed a semi-structured play interaction protocol which was designed to elicit a broad range of social behaviors. We identify the key technical challenges in analyzing these behaviors, and describe methods for decoding the interactions. We present experimental results that demonstrate the potential of the dataset to drive interesting research questions, and show preliminary results for multi-modal activity recognition. 
}, keywords = {autism, behavioral imaging, computational health, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2013-Raza-GCFV, title = {Geometric Context from Video}, author = {Syed Hussain Raza and Matthias Grundmann and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/videogeometriccontext/}, doi = {10.1109/CVPR.2013.396}, year = {2013}, date = {2013-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, organization = {IEEE Computer Society}, keywords = {computational video, computer vision, CVPR, video segmentation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2013-Bettadapura-ABDDTSIAR, title = {Augmenting Bag-of-Words: Data-Driven Discovery of Temporal and Structural Information for Activity Recognition}, author = {Vinay Bettadapura and Grant Schindler and Thomas Ploetz and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/abow/}, doi = {10.1109/CVPR.2013.338}, year = {2013}, date = {2013-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, organization = {IEEE Computer Society}, keywords = {activity recognition, computational video, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2013-Grundmann-PARSV, title = {Post-processing Approach for Radiometric Self-Calibration of Video}, author = {Matthias Grundmann and Chris McClanahan and Sing Bing Kang and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/radiometric}, doi = {10.1109/ICCPhot.2013.6528307}, year = {2013}, date = {2013-04-01}, booktitle = {IEEE Conference on Computational Photography (ICCP)}, organization = {IEEE Computer Society}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2013-Thomaz-FIEMFFILHC, title = {Feasibility of Identifying Eating Moments from First-Person Images Leveraging Human Computation}, author = {Edison Thomaz and Aman Parnami and Irfan Essa and Gregory Abowd}, doi = {10.1145/2526667.2526672}, year = {2013}, date = {2013-01-01}, urldate = {2013-01-01}, booktitle = {Proceedings of ACM International SenseCam and Pervasive Imaging (SenseCam '13)}, keywords = {activity recognition, behavioral imaging, computational health, ubiquitous computing, wearable computing}, pubstate = {published}, tppubtype = {inproceedings} } @article{2013-Hamid-VFTSCUMSC, title = {A Visualization Framework for Team Sports Captured using Multiple Static Cameras}, author = {Raffay Hamid and Ramkrishan Kumar and Jessica Hodgins and Irfan Essa}, url = {http://raffayhamid.com/sports_viz.shtml}, doi = {10.1016/j.cviu.2013.09.006}, issn = {1077-3142}, year = {2013}, date = {2013-01-01}, journal = {Computer Vision and Image Understanding}, number = {0}, pages = {-}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{2012-Kim-BSMHE, title = {Beyond Sentiment: The Manifold of Human Emotions}, author = {Seungyeon Kim and Fuxin Li and Guy Lebanon and Irfan Essa}, year = {2013}, date = {2013-01-01}, urldate = {2013-01-01}, booktitle = {Artificial Intelligence and Statistics (AISTATS)}, keywords = {affective computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2012-Hartmann-WSLOSFWV, title = {Weakly Supervised Learning of Object Segmentations from Web-Scale Videos}, author = {Glenn Hartmann and Matthias Grundmann and Judy Hoffman and David Tsai and Vivek Kwatra and Omid Madani and Sudheendra Vijayanarasimhan and Irfan Essa and James Rehg and Rahul 
Sukthankar}, url = {https://link.springer.com/chapter/10.1007/978-3-642-33863-2_20 https://research.google.com/pubs/archive/40735.pdf }, doi = {10.1007/978-3-642-33863-2_20}, year = {2012}, date = {2012-10-01}, urldate = {2012-10-01}, booktitle = {Proceedings of ECCV 2012 Workshop on Web-scale Vision and Social Media}, abstract = {We propose to learn pixel-level segmentations of objects from weakly labeled (tagged) internet videos. Specifically, given a large collection of raw YouTube content, along with potentially noisy tags, our goal is to automatically generate spatiotemporal masks for each object, such as “dog”, without employing any pre-trained object detectors. We formulate this problem as learning weakly supervised classifiers for a set of independent spatio-temporal segments. The object seeds obtained using segment-level classifiers are further refined using graphcuts to generate high-precision object masks. Our results, obtained by training on a dataset of 20,000 YouTube videos weakly tagged into 15 classes, demonstrate automatic extraction of pixel-level object masks. Evaluated against a ground-truthed subset of 50,000 frames with pixel-level annotations, we confirm that our proposed methods can learn good object masks just by watching YouTube. }, keywords = {awards, best paper award, computer vision, ECCV, machine learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2012-Thomaz-RWAHTIS, title = {Recognizing Water-Based Activities in the Home Through Infrastructure-Mediated Sensing}, author = {Edison Thomaz and Vinay Bettadapura and Gabriel Reyes and Megha Sandesh and Grant Schindler and Thomas Ploetz and Gregory Abowd and Irfan Essa}, url = {http://www.ethomaz.com/2012/09/05/activity-rec-ims-ubicomp-2012/}, doi = {10.1145/2370216.2370230}, year = {2012}, date = {2012-09-01}, urldate = {2012-09-01}, booktitle = {ACM International Conference on Ubiquitous Computing (UBICOMP)}, keywords = {aware home, intelligent environments, ubiquitous computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2012-Wang-OASUMC, title = {Orientation Aware Scene Understanding for Mobile Camera}, author = {Jing Wang and Grant Schindler and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/orientation-aware/}, doi = {10.1145/2370216.2370258}, year = {2012}, date = {2012-09-01}, booktitle = {ACM International Conference on Ubiquitous Computing (UBICOMP)}, keywords = {mobile vision, scene understanding, ubiquitous computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2012-Dantam-LTHATR, title = {Linguistic Transfer of Human Assembly Tasks to Robots}, author = {N. Dantam and I. Essa and M. Stilman}, doi = {10.1109/IROS.2012.6385749}, year = {2012}, date = {2012-01-01}, booktitle = {Intelligent Robots and Systems (IROS)}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2012-Grundmann-CRSR, title = {Calibration-Free Rolling Shutter Removal}, author = {Matthias Grundmann and Vivek Kwatra and Daniel Castro and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/rollingshutter/ https://research.google.com/pubs/archive/37744.pdf https://youtu.be/_Pr_fpbAok8}, doi = {10.1109/ICCPhot.2012.6215213}, year = {2012}, date = {2012-01-01}, urldate = {2012-01-01}, booktitle = {IEEE Conference on Computational Photography (ICCP)}, publisher = {IEEE Computer Society}, abstract = {We present a novel algorithm for efficient removal of rolling shutter distortions in uncalibrated streaming videos. 
Our proposed method is calibration-free as it does not need any knowledge of the camera used, nor does it require calibration using specially recorded calibration sequences. Our algorithm can perform rolling shutter removal under varying focal lengths, as in videos from CMOS cameras equipped with an optical zoom. We evaluate our approach across a broad range of cameras and video sequences demonstrating robustness, scalability, and repeatability. We also conducted a user study, which demonstrates preference for the output of our algorithm over other state-of-the-art methods. Our algorithm is computationally efficient, easy to parallelize, and robust to challenging artifacts introduced by various cameras with differing technologies. }, keywords = {awards, best paper award, computational photography, computational video, computer graphics, computer vision, ICCP}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2012-Kim-DRIDSWCM, title = {Detecting Regions of Interest in Dynamic Scenes with Camera Motions}, author = {Kihwan Kim and Dongreyol Lee and Irfan Essa}, url = {http://www.cc.gatech.edu/cpl/projects/roi/}, doi = {10.1109/CVPR.2012.6247809}, year = {2012}, date = {2012-01-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, publisher = {IEEE Computer Society}, keywords = {computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2011-Thomaz-ITLADLAML, title = {Interactive Techniques for Labeling Activities Of Daily Living to Assist Machine Learning}, author = {Edison Thomaz and Thomas Ploetz and Irfan Essa and Gregory Abowd}, url = {https://wish2011.wordpress.com/accepted-papers/ https://users.ece.utexas.edu/~ethomaz/papers/w1.pdf}, year = {2011}, date = {2011-11-01}, urldate = {2011-11-01}, booktitle = {Proceedings of Workshop on Interactive Systems in Healthcare}, abstract = {Over the next decade, as healthcare continues its march away from the hospital and towards the home, logging and making sense of activities of daily living will play a key role in health modeling and life-long home care. Machine learning research has explored ways to automate the detection and quantification of these activities in sensor-rich environments. While we continue to make progress in developing practical and cost-effective activity sensing techniques, one large hurdle remains: obtaining labeled activity data to train activity recognition systems. In this paper, we discuss the process of gathering ground truth data with human participation for health modeling applications. In particular, we propose a criterion and design space containing five dimensions that we have identified as central to the characterization and evaluation of interactive labeling methods. }, keywords = {activity recognition, behavioral imaging, computational health, wearable computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2011-Kim-GPRFAMT, title = {Gaussian Process Regression Flow for Analysis of Motion Trajectories}, author = {K. Kim and D. Lee and I. Essa}, url = {http://www.cc.gatech.edu/cpl/projects/gprf/}, year = {2011}, date = {2011-11-01}, booktitle = {IEEE International Conference on Computer Vision (ICCV)}, publisher = {IEEE Computer Society}, keywords = {computer vision, ICCV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2011-Grundmann-AVSWROCP, title = {Auto-Directed Video Stabilization with Robust L1 Optimal Camera Paths}, author = {M. Grundmann and V. Kwatra and I. 
Essa}, url = {http://www.cc.gatech.edu/cpl/projects/videostabilization/}, doi = {10.1109/CVPR.2011.5995525}, year = {2011}, date = {2011-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, publisher = {IEEE Computer Society}, keywords = {computational video, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2011-Sarin-3VORUAMCNPESSE, title = {3-Dimensional Visualization of the Operating Room Using Advanced Motion Capture: A Novel Paradigm to Expand Simulation-Based Surgical Education}, author = {Eric Sarin and Kihwan Kim and Irfan Essa and William Cooper}, year = {2011}, date = {2011-01-01}, urldate = {2011-01-01}, booktitle = {Proceedings of Society of Thoracic Surgeons Annual Meeting}, publisher = {Society of Thoracic Surgeons}, keywords = {computational health, computer vision, intelligent environments, surgical training}, pubstate = {published}, tppubtype = {inproceedings} } @article{2011-Kim-AAEMWDIFV, title = {Augmenting aerial earth maps with dynamic information from videos}, author = {K. Kim and S. Oh and J. Lee and I. Essa}, url = {http://www.cc.gatech.edu/cpl/projects/augearth}, doi = {10.1007/s10055-010-0186-2}, year = {2011}, date = {2011-01-01}, journal = {Journal of Virtual Reality, Special Issue on Augmented Reality}, volume = {15}, number = {2-3}, pages = {1359-4338}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{2010-Hamid-PLUMSCSV, title = {Player Localization Using Multiple Static Cameras for Sports Visualization}, author = {Raffay Hamid and Ramkrishan Kumar and Matthias Grundmann and Kihwan Kim and Irfan Essa and Jessica Hodgins}, url = {http://www.raffayhamid.com/sports_viz.shtml}, doi = {10.1109/CVPR.2010.5540142}, year = {2010}, date = {2010-06-01}, urldate = {2010-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society}, keywords = {activity recognition, computer vision, CVPR, sports visualization}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2010-Grundmann-EHGVS, title = {Efficient Hierarchical Graph-Based Video Segmentation}, author = {M. Grundmann and V. Kwatra and M. Han and I. Essa}, url = {http://www.cc.gatech.edu/cpl/projects/videosegmentation/}, doi = {10.1109/CVPR.2010.5539893}, year = {2010}, date = {2010-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, keywords = {computational video, computer vision, CVPR, video segmentation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2010-Grundmann-DSVR, title = {Discontinuous Seam-Carving for Video Retargeting}, author = {M. Grundmann and V. Kwatra and M. Han and I. Essa}, url = {http://www.cc.gatech.edu/cpl/projects/videoretargeting/}, doi = {10.1109/CVPR.2010.5540165}, year = {2010}, date = {2010-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, publisher = {IEEE Computer Society}, keywords = {computational video, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2010-Kim-MFPPEDSS, title = {Motion Field to Predict Play Evolution in Dynamic Sport Scenes}, author = {K. Kim and M. Grundmann and A. Shamir and I. Matthews and J. Hodgins and I. 
Essa}, doi = {10.1109/CVPR.2010.5540128}, year = {2010}, date = {2010-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, keywords = {computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2010-Diakopoulos-MVCVQE, title = {Modulating Video Credibility via Visualization of Quality Evaluations}, author = {N. Diakopoulos and I. Essa}, doi = {10.1145/1772938.1772953}, year = {2010}, date = {2010-04-01}, booktitle = {WWW Workshop on Information Credibility on the Web (WICOW)}, keywords = {computational journalism}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2009-Kim-AAEMWDI, title = {Augmenting Aerial Earth Maps with Dynamic Information}, author = {K. Kim and S. Oh and J. Lee and I. Essa}, url = {http://www.cc.gatech.edu/cpl/projects/augearth/}, doi = {10.1109/ISMAR.2009.5336505}, year = {2009}, date = {2009-10-01}, booktitle = {Proceedings of IEEE International Symposium on Mixed and Augmented Reality (ISMAR)}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{2009-Kwatra-FSWAB, title = {Fluid Simulation with Articulated Bodies}, author = {N. Kwatra and C. Wojtan and M. Carlson and I. Essa and P. Mucha and Greg Turk}, year = {2009}, date = {2009-06-01}, journal = {IEEE Transactions on Visualization and Computer Graphics}, keywords = {}, pubstate = {published}, tppubtype = {article} } @article{2009-Hamid-NSRUAHA, title = {A Novel Sequence Representation for Unsupervised Analysis of Human Activities}, author = {R. Hamid and S. Maddi and A. Johnson and A. Bobick and I. Essa and C. Isbell}, year = {2009}, date = {2009-05-01}, journal = {Artificial Intelligence Journal}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{2009-Yin-LBUASLUDSFS, title = {Learning Basic Units in American Sign Language using Discriminative Segmental Feature Selection}, author = {P. Yin and T. Starner and H. Hamilton and I. Essa and J. M. Rehg}, year = {2009}, date = {2009-04-01}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, pages = {4757-4760}, abstract = {The natural language for most deaf signers in the United States is American Sign Language (ASL). ASL has internal structure like spoken languages, and ASL linguists have introduced several phonemic models. The study of ASL phonemes is not only interesting to linguists, but also useful for scalability in recognition by machines. Since machine perception is different than human perception, this paper learns the basic units for ASL directly from data. Comparing with previous studies, our approach computes a set of data-driven units (fenemes) discriminatively from the results of segmental feature selection. The learning iterates the following two steps: first apply discriminative feature selection segmentally to the signs, and then tie the most similar temporal segments to re-train. Intuitively, the sign parts indistinguishable to machines are merged to form basic units, which we call ASL fenemes. Experiments on publicly available ASL recognition data show that the extracted data-driven fenemes are meaningful, and recognition using those fenemes achieves improved accuracy at reduced model complexity}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2009-Diakopoulos-VQAOIVBJ, title = {Videolyzer: Quality Analysis of Online Informational Video for Bloggers and Journalists}, author = {N. Diakopoulos and S. Goldenberg and I. 
Essa}, doi = {10.1145/1518701.1518824}, year = {2009}, date = {2009-04-01}, booktitle = {ACM CHI Conference on Human factors in Computing Systems}, pages = {799-808}, keywords = {computational journalism}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2009-Flagg-HVT, title = {Human Video Textures}, author = {M. Flagg and A. Nakazawa and Q. Zhang and S. B. Kang and Y. K. Ryu and I. Essa and J. M. Rehg}, year = {2009}, date = {2009-03-01}, booktitle = {Proceedings of the ACM Symposium on Interactive 3D Graphics and Games 2009 (I3D '09)}, pages = {199--206}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{2009-Rusu-HARUGPFHAS, title = {Human Action Recognition Using Global Point Feature Histograms and Action Shapes}, author = {R. B. Rusu and J. Bandouch and F. Meier and Irfan Essa and M. Beetz}, url = {http://www.ingentaconnect.com/content/vsp/arb/2009/00000023/00000014/art00004}, doi = {10.1163/016918609X12518783330243}, year = {2009}, date = {2009-01-01}, urldate = {2009-01-01}, journal = {Advanced Robotics}, volume = {23}, number = {14}, pages = {1873-1908}, keywords = {activity recognition, computer vision}, pubstate = {published}, tppubtype = {article} } @inproceedings{2008-Grundmann-SCDTAR, title = {3D Shape Context and Distance Transform for Action Recognition}, author = {M. Grundmann and F. Meier and I. Essa}, year = {2008}, date = {2008-12-01}, booktitle = {Proceedings of International Conference on Pattern Recognition (ICPR)}, pages = {1-4}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2008-Diakopoulos-APPTTSTWPG, title = {Audio Puzzler: Piecing Together Time-Stamped Speech Transcripts with a Puzzle Game}, author = {N. Diakopoulos and K. Luther and I. Essa}, doi = {10.1145/1459359.1459507}, year = {2008}, date = {2008-10-01}, booktitle = {ACM International Conference on Multimedia (ACM-MM)}, abstract = {We have developed an audio-based casual puzzle game which produces a time-stamped transcription of spoken audio as a by-product of play. Our evaluation of the game indicates that it is both fun and challenging. The transcripts generated using the game are more accurate than those produced using a standard automatic transcription system and the time-stamps of words are within several hundred milliseconds of ground truth.}, keywords = {social computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2008-Kim-LRUSU, title = {Localization and 3D Reconstruction of Urban Scenes Using GPS}, author = {Kihwan Kim and Jay Summet and Thad Starner and Dan Ashbrook and M. Kapade and Irfan Essa}, year = {2008}, date = {2008-09-01}, urldate = {2008-09-01}, booktitle = {Proceedings of IEEE International Symposium on Wearable Computers (ISWC)}, pages = {11--14}, publisher = {IEEE Computer Society}, keywords = {IMWUT, navigation, wearable computing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2008-Diakopoulos-AMMSIQOV, title = {An Annotation Model for Making Sense of Information Quality in Online Videos}, author = {N. Diakopoulos and I. Essa}, year = {2008}, date = {2008-09-01}, booktitle = {Proceedings of International Conference on the Pragmatic Web 2008}, pages = {31-34}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2008-Yin-DFSHMMUSB, title = {Discriminative Feature Selection for Hidden Markov Models Using Segmental Boosting}, author = {P. Yin and I. Essa and T. Starner and J. M. 
Rehg}, year = {2008}, date = {2008-03-01}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, address = {Las Vegas, Nevada, USA}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Hamid-SFSUAAUST, title = {Structure from Statistics - Unsupervised Activity Analysis using Suffix Trees}, author = {R. Hamid and S. Maddi and A. Bobick and I. Essa}, url = {http://dx.doi.org/10.1109/ICCV.2007.4408894 }, doi = {10.1109/ICCV.2007.4408894}, year = {2007}, date = {2007-10-14}, urldate = {2007-10-14}, booktitle = {IEEE International Conference on Computer Vision (ICCV)}, publisher = {IEEE Computer Society Press}, abstract = {Models of activity structure for unconstrained environments are generally not available a priori. Recent representational approaches to this end are limited by their computational complexity and ability to capture activity structure only up to some fixed temporal scale. In this work, we propose Suffix Trees as an activity representation to efficiently extract the structure of activities by analyzing their constituent event subsequences over multiple temporal scales. We empirically compare Suffix Trees with some of the previous approaches in terms of feature cardinality, discriminative prowess, noise sensitivity, and activity-class discovery. Finally, exploiting the properties of Suffix Trees, we present a novel perspective on anomalous subsequences of activities and propose an algorithm to detect them in linear time. We present comparative results over experimental data collected from a kitchen environment to demonstrate the competence of our proposed framework. }, keywords = {activity discovery, activity recognition, computer vision, ICCV, IEEE}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Minnen-DSMEAGMPD, title = {Detecting Subdimensional Motifs: An Efficient Algorithm for Generalized Multivariate Pattern Discovery}, author = {D. Minnen and I. Essa and C. Isbell and T. Starner}, url = {https://doi.org/10.1109/ICDM.2007.52}, doi = {10.1109/ICDM.2007.52}, year = {2007}, date = {2007-10-01}, urldate = {2007-10-01}, booktitle = {IEEE International Conference on Data Mining (ICDM)}, abstract = {Discovering recurring patterns in time series data is a fundamental problem for temporal data mining. This paper addresses the problem of locating sub-dimensional motifs in real-valued, multivariate time series, which requires the simultaneous discovery of sets of recurring patterns along with the corresponding relevant dimensions. While many approaches to motif discovery have been developed, most are restricted to categorical data, univariate time series, or multivariate data in which the temporal patterns span all dimensions. In this paper, we present an expected linear-time algorithm that addresses a generalization of multivariate pattern discovery in which each motif may span only a subset of the dimensions. To validate our algorithm, we discuss its theoretical properties and empirically evaluate it using several data sets, including synthetic data and motion capture data collected by an on-body inertial sensor. }, keywords = {activity recognition}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Padoy-BSMSWA, title = {A Boosted Segmentation Method for Surgical Workflow Analysis}, author = {N. Padoy and T. Blum and I. Essa and H. Feussner and M. O. Berger and N. 
Navab}, doi = {10.1007/978-3-540-75757-3_13}, year = {2007}, date = {2007-10-01}, booktitle = {Proceedings of International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI)}, publisher = {Springer Lecture Notes in Computer Science (LNCS) series}, address = {Brisbane, Australia}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Diakopoulos-EARS, title = {The Evolution of Authorship in a Remix Society}, author = {N. Diakopoulos and K. Luther and Y. Medynskiy and I. Essa}, doi = {10.1145/1286240.1286272}, year = {2007}, date = {2007-09-01}, booktitle = {ACM Conference on Hypertext and Hypermedia}, publisher = {ACM Press}, address = {Manchester, UK}, abstract = {Authorship entails the constrained selection or generation of media and the organization and layout of that media in a larger structure. But authorship is more than just selection and organization; it is a complex construct incorporating concepts of originality, authority, intertextuality, and attribution. In this paper we explore these concepts and ask how they are changing in light of modes of collaborative authorship in remix culture. We present a qualitative case study of an online video remixing site, illustrating how the constraints of that environment are impacting authorial constructs. We discuss users' self-conceptions as authors, and how values related to authorship are reflected to users through the interface and design of the site's tools. We also present some implications for the design of online communities for collaborative media creation and remixing.}, keywords = {computational journalism}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Parry-PNSF, title = {Phase-Aware Non-negative Spectrogram Factorization}, author = {R. M. Parry and I. Essa}, year = {2007}, date = {2007-09-01}, booktitle = {Proceedings of International Conference on Independent Component Analysis and Blind Signal Separation}, address = {London}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Minnen-DVMMDSUBS, title = {Discovering Variable-Length Motifs in Multivariate Data Streams using Bayesian Surprise}, author = {D. Minnen and T. Starner and I. Essa and C. Isbell}, year = {2007}, date = {2007-08-01}, booktitle = {Proceedings of International Conference on Knowledge Discovery and Data Mining (KDDM)}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{2007-Rogers-DTC, title = {Designing a Technology Coach}, author = {W. Rogers and I. Essa and A. Fisk}, url = {https://doi.org/10.1177/1064804607015003}, doi = {10.1177/1064804607015003}, year = {2007}, date = {2007-07-01}, urldate = {2007-07-01}, journal = {Ergonomics in Design, Journal of the Human Factors and Ergonomics Society}, volume = {15}, number = {3}, pages = {17--23}, abstract = {Technology in the home environment has the potential to support older adults in a variety of ways. We took an interdisciplinary approach (human factors/ergonomics and computer science) to develop a technology “coach” that could support older adults in learning to use a medical device. Our system provided a computer vision system to track the use of a blood glucose meter and provide users with feedback if they made an error. This research could support the development of an in-home personal assistant to coach individuals in a variety of tasks necessary for independent living.
}, keywords = {aging-in-place, aware home, human-computer interaction}, pubstate = {published}, tppubtype = {article} } @inproceedings{2007-Yin-TCBVS, title = {Tree-based Classifiers for Bilayer Video Segmentation}, author = {P. Yin and A. Criminisi and J. Winn and I. Essa}, year = {2007}, date = {2007-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {1--8}, publisher = {IEEE Computer Society}, address = {Minneapolis, MN, USA}, keywords = {computational video, computer vision, CVPR, video segmentation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Minnen-DMMUSD, title = {Discovering Multivariate Motifs using Subsequence Density Estimation}, author = {D. Minnen and C. Isbell and I. Essa and T. Starner}, url = {http://www.aaai.org/Library/AAAI/2007/aaai07-097.php}, year = {2007}, date = {2007-04-01}, booktitle = {American Association of Artificial Intelligence Conference (AAAI)}, organization = {AAAI}, abstract = {The problem of locating motifs in real-valued, multivariate time series data involves the discovery of sets of recurring patterns embedded in the time series. Each set is composed of several non-overlapping subsequences and constitutes a motif because all of the included subsequences are similar. The ability to automatically discover such motifs allows intelligent systems to form endogenously meaningful representations of their environment through unsupervised sensor analysis. In this paper, we formulate a unifying view of motif discovery as a problem of locating regions of high density in the space of all time series subsequences. Our approach is efficient (sub-quadratic in the length of the data), requires fewer user-specified parameters than previous methods, and naturally allows variable length motif occurrences and non-linear temporal warping. We evaluate the performance of our approach using four data sets from different domains including on-body inertial sensors and speech.}, keywords = {activity discovery, motif discovery, unsupervised learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Parry-IPISSSF, title = {Incorporating Phase Information for Source Separation via Spectrogram Factorization}, author = {R. M. Parry and I. Essa}, year = {2007}, date = {2007-04-01}, booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, address = {Honolulu, Hawaii}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2007-Minnen-IADWANE, title = {Improving Activity Discovery with Automatic Neighborhood Estimation}, author = {D. Minnen and T. Starner and I. Essa and C. Isbell}, year = {2007}, date = {2007-01-01}, booktitle = {Proceedings of International Joint Conference on Artificial Intelligence (IJCAI)}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Parry-SFUPI, title = {Spectrogram Factorization Using Phase Information}, author = {R. M. Parry and I. Essa}, year = {2006}, date = {2006-12-01}, booktitle = {Proceedings of Neural Information Processing Systems: Workshop on Advances in Models for Acoustic Processing}, address = {Whistler, Canada}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Diakopoulos-VAPDVST, title = {Videotater: An Approach for Pen-Based Digital Video Segmentation and Tagging}, author = {N. Diakopoulos and I.
Essa}, doi = {10.1145/1166253.1166287}, year = {2006}, date = {2006-10-01}, booktitle = {ACM Symposium on User Interface Software and Technology (UIST)}, abstract = {The continuous growth of media databases necessitates development of novel visualization and interaction techniques to support management of these collections. We present Videotater, an experimental tool for a Tablet PC that supports the efficient and intuitive navigation, selection, segmentation, and tagging of video. Our veridical representation immediately signals to the user where appropriate segment boundaries should be placed and allows for rapid review and refinement of manually or automatically generated segments. Finally, we explore a distribution of modalities in the interface by using multiple timeline representations, pressure sensing, and a tag painting/erasing metaphor with the pen.}, keywords = {computational video}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Kim-IMGVN, title = {Interactive Mosaic Generation for Video Navigation}, author = {Kihwan Kim and Irfan Essa and Gregory Abowd}, url = {https://doi.org/10.1145/1180639.1180776}, doi = {10.1145/1180639.1180776}, year = {2006}, date = {2006-10-01}, urldate = {2006-10-01}, booktitle = {ACM International Conference on Multimedia (ACM-MM)}, address = {Santa Barbara, CA, USA}, abstract = {Navigation through large multimedia collections that include videos and images still remains cumbersome. In this paper, we introduce a novel method to visualize and navigate through the collection by creating a mosaic image that visually represents the compilation. This image is generated by a labeling-based layout algorithm using various sizes of sample tile images from the collection. Each tile represents both the photographs and video files representing scenes selected by matching algorithms. This generated mosaic image provides a new way to navigate thematic video and visually summarizes the videos. Users can generate these mosaics with some predefined themes and layouts, or base it on the results of their queries. Our approach supports automatic generation of these layouts by using meta-information such as color, time-line and existence of faces or manually generated annotated information from existing systems (e.g., the Family Video Archive).}, keywords = {ACMMM, computational video, multimedia}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Minnen-DCAFOSD, title = {Discovering Characteristic Actions from On-Body Sensor Data}, author = {D. Minnen and T. Starner and I. Essa and C. Isbell}, doi = {http://dx.doi.org/10.1109/ISWC.2006.286337}, year = {2006}, date = {2006-10-01}, booktitle = {Proceedings of IEEE International Symposium on Wearable Computers (ISWC)}, pages = {11-18}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Shi-LTSMFPLD, title = {Learning Temporal Sequence Model from Partially Labeled Data}, author = {Y. Shi and A. Bobick and I. Essa}, year = {2006}, date = {2006-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {1631 - 1638}, publisher = {IEEE Computer Society}, keywords = {activity recognition, computational video, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Parry-SDURS, title = {Source Detection Using Repetitive Structure}, author = {R. M. Parry and I.
Essa}, year = {2006}, date = {2006-05-01}, booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, volume = {4}, pages = {1093--1096}, address = {Toulouse, France}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Minnen-ADSMFMTS, title = {Activity Discovery: Sparse Motifs from Multivariate Time Series}, author = {D. Minnen and T. Starner and I. Essa and C. Isbell}, year = {2006}, date = {2006-04-01}, booktitle = {Proceedings of The Learning Workshop at Snowbird}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Parry-ESPSCA, title = {Estimating the Spatial Position of Spectral Components in Audio}, author = {R. M. Parry and I. Essa}, year = {2006}, date = {2006-03-01}, booktitle = {Proceedings of International Conference on Independent Component Analysis and Blind Signal Separation}, pages = {666--673}, publisher = {Springer}, address = {Charleston, SC}, series = {Lecture Notes in Computer Science (LNCS)}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Choi-EEMVFC, title = {Element-Free Elastic Models for Volume Fitting and Capture}, author = {J. Choi and A. Szymczak and G. Turk and I. Essa}, url = {https://ieeexplore.ieee.org/document/1641028}, doi = {10.1109/CVPR.2006.110}, year = {2006}, date = {2006-01-01}, urldate = {2006-01-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, volume = {2}, pages = {2245--2252}, abstract = {We present a new method of fitting an element-free volumetric model to a sequence of deforming surfaces of a moving object. Given a sequence of visual hulls, we iteratively fit an element-free elastic model to the visual hull in order to extract the optimal pose of the captured volume. The fitting of the volumetric model is achieved by minimizing a combination of elastic potential energy, a surface distance measure, and a self-intersection penalty for each frame. A unique aspect of our work is that the model is mesh-free - since the model is represented as a point cloud, it is easy to construct, manipulate and update the model as needed. Additionally, linear elasticity with rotation compensation makes it possible to handle local deformations and large rotations of body parts much more efficiently than other volume fitting approaches. Our experimental results for volume fitting and capture in a multi-view camera setting demonstrate the robustness of element-free elastic models against noise and self-occlusions. }, keywords = {computer vision, CVPR, shape modeling}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2006-Hamid-UAASUEM, title = {Unsupervised Analysis of Activity Sequences Using Event Motifs}, author = {R. Hamid and S. Maddi and A. Bobick and I. Essa}, year = {2006}, date = {2006-01-01}, booktitle = {Proceedings of ACM International Workshop on Video Surveillance and Sensor Networks (IWVSSN)}, organization = {ACM}, keywords = {activity recognition}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2005-Diakopoulos-SPMA, title = {Supporting Personal Media Authoring}, author = {N. Diakopoulos and I.
Essa}, year = {2005}, date = {2005-11-01}, booktitle = {Proceedings of the Workshop on Multimedia for Human Communication (MHC) at ACM Multimedia}, pages = {21 - 23}, address = {Singapore}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2005-Diakopoulos-MPCA, title = {Mediating Photo Collage Authoring}, author = {N. Diakopoulos and I. Essa}, url = {http://www.nickdiakopoulos.com/projects/mediating-photo-collage-authoring/}, doi = {10.1145/1095034.1095065}, year = {2005}, date = {2005-10-01}, booktitle = {ACM Symposium on User Interface Software and Technology (UIST)}, pages = {183 - 186}, address = {Seattle}, keywords = {content creation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2005-Parry-BSSURS, title = {Blind Source Separation Using Repetitive Structure}, author = {R. M. Parry and I. Essa}, year = {2005}, date = {2005-09-01}, booktitle = {Proceedings of International Conference on Digital Audio Effects}, pages = {143--148}, address = {Madrid}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{2005-Kwatra-TOES, title = {Texture Optimization for Example-based Synthesis}, author = {V. Kwatra and I. Essa and A. Bobick and N. Kwatra}, url = {https://dl.acm.org/doi/10.1145/1186822.1073263 https://www.cc.gatech.edu/gvu/perception/projects/textureoptimization/ https://youtu.be/Ys_U46-FeEM http://www.cc.gatech.edu/gvu/perception/projects/textureoptimization/TextureOptimization_DVD.mov http://www.cc.gatech.edu/gvu/perception/projects/textureoptimization/TO-sig05.ppt http://www.cc.gatech.edu/gvu/perception/projects/textureoptimization/TO-final.pdf }, doi = {10.1145/1073204.1073263}, year = {2005}, date = {2005-08-01}, urldate = {2005-08-01}, journal = {ACM SIGGRAPH Proceedings of Annual Conference on Computer graphics and interactive techniques}, volume = {24}, number = {3}, pages = {795--802}, abstract = {We present a novel technique for texture synthesis using optimization. We define a Markov Random Field (MRF)-based similarity metric for measuring the quality of synthesized texture concerning a given input sample. This allows us to formulate the synthesis problem as the minimization of an energy function, which is optimized using an Expectation Maximization (EM)-like algorithm. In contrast to most example-based techniques that do region-growing, ours is a joint optimization approach that progressively refines the entire texture. Additionally, our approach is ideally suited to allow for the controllable synthesis of textures. Specifically, we demonstrate controllability by animating image textures using flow fields. We allow for general two-dimensional flow fields that may dynamically change over time. Applications of this technique include dynamic texturing of fluid animations and texture-based flow visualization.}, keywords = {ACM, computational video, computer animation, computer graphics, computer vision, SIGGRAPH}, pubstate = {published}, tppubtype = {article} } @inproceedings{2005-Hamid-DCAFE, title = {Discovery and Characterization of Activities from Event-Streams}, author = {R. Hamid and S. Maddi and A. Johnson and A. Bobick and I. Essa and C. Isbell}, year = {2005}, date = {2005-07-01}, booktitle = {Uncertainty in Artificial Intelligence (UAI)}, pages = {251-258}, address = {Edinburgh, SCOTLAND}, keywords = {activity recognition}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2005-Huang-TMOTO, title = {Tracking Multiple Objects Through Occlusions}, author = {Y. 
Huang and I. Essa}, year = {2005}, date = {2005-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {1051--1058}, publisher = {IEEE Computer Society}, address = {San Diego, CA, USA}, keywords = {activity recognition, computational video, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @article{2005-Angelov-EWOSACE, title = {Experiences with optimizing two stream-based applications for cluster execution.}, author = {Y. Angelov and Umakishore Ramachandran and Ken Mackenzie and James Rehg and Irfan Essa}, year = {2005}, date = {2005-01-01}, urldate = {2005-01-01}, journal = {Journal of Parallel and Distributed Computing}, volume = {65}, number = {6}, pages = {678-691}, keywords = {audio-video fusion, intelligent environments, multimedia}, pubstate = {published}, tppubtype = {article} } @inproceedings{2005-Hamid-UADCFE, title = {Unsupervised Activity Discovery and Characterization From Event-Streams}, author = {Raffay Hamid and Siddhartha Maddi and Amos Johnson and Aaron Bobick and Irfan Essa and Charles Isbell}, url = {https://arxiv.org/abs/1207.1381 https://arxiv.org/pdf/1207.1381}, doi = {10.48550/arXiv.1207.1381}, year = {2005}, date = {2005-01-01}, urldate = {2005-01-01}, booktitle = {Proceedings of The Learning Workshop at Snowbird}, address = {Snowbird, Utah}, abstract = {We present a framework to discover and characterize different classes of everyday activities from event-streams. We begin by representing activities as bags of event n-grams. This allows us to analyze the global structural information of activities, using their local event statistics. We demonstrate how maximal cliques in an undirected edge-weighted graph of activities can be used for activity-class discovery in an unsupervised manner. We show how modeling an activity as a variable length Markov process can be used to discover recurrent event-motifs to characterize the discovered activity-classes. We present results over extensive data-sets, collected from multiple active environments, to show the competence and generalizability of our proposed framework. }, keywords = {activity discovery, activity recognition, computer vision, machine learning}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2005-Kim-VNEIM, title = {Video-based nonphotorealistic and expressive illustration of motion}, author = {B. Kim and I. Essa}, year = {2005}, date = {2005-01-01}, booktitle = {Proceedings of Computer Graphics International (CGI)}, pages = {32 - 35}, publisher = {IEEE Computer Society Press}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2004-Parry-FWS, title = {Feature Weighting for Segmentation}, author = {R. M. Parry and I. Essa}, year = {2004}, date = {2004-10-01}, booktitle = {Proceedings of International Conference on Music Information Retrieval}, pages = {116--119}, address = {Barcelona}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2004-Yin-ABSR, title = {Asymmetrically Boosted HMM for Speech Reading}, author = {P. Yin and I. Essa and J. M.
Rehg}, year = {2004}, date = {2004-06-01}, urldate = {2004-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {II-755--761}, publisher = {IEEE Computer Society}, address = {Washington, DC, USA}, keywords = {audio-video fusion, computer vision, CVPR, speech reading}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2004-Hays-IVBPA, title = {Image and video based painterly animation}, author = {J. Hays and I. Essa}, url = {http://www-static.cc.gatech.edu/gvu/perception/projects/artstyling/}, doi = {10.1145/987657.987676}, year = {2004}, date = {2004-06-01}, booktitle = {ACM Conference on Non-Photorealistic Animation and Rendering (NPAR)}, pages = {113--120}, publisher = {ACM Press}, address = {New York, NY, USA}, keywords = {Non-photorealistic Rendering}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2004-Diakopoulos-CBIS, title = {Content Based Image Synthesis.}, author = {N. Diakopoulos and I. Essa and R. Jain}, year = {2004}, date = {2004-01-01}, booktitle = {Proceedings of Conference on Content-Based Image and Video Retrieval (CIVR)}, pages = {299-307}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2004-Shi-PNRPOSA, title = {Propagation Networks for recognition of partially ordered sequential action}, author = {Y. Shi and Y. Huang and D. Minnen and A. Bobick and I. Essa}, year = {2004}, date = {2004-01-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {862-869}, publisher = {IEEE Computer Society}, address = {Washington, DC}, keywords = {activity recognition, computational video, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2004-Brostow-NSRAC, title = {Novel Skeletal Representation For Articulated Creatures}, author = {G. J. Brostow and I. Essa and D. Steedly and V. Kwatra}, year = {2004}, date = {2004-01-01}, booktitle = {Proceedings of European Conference on Computer Vision (ECCV)}, pages = {Vol III: 66-78}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2004-Covington-PA, title = {Parameterized Authentication.}, author = {Michael Covington and Mustaque Ahamad and Irfan Essa and H. Venkateswaran}, year = {2004}, date = {2004-01-01}, urldate = {2004-01-01}, booktitle = {Proceedings of European Symposium on Research in Computer Security (ESORICS)}, pages = {276--292}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Yin-BASR-asilomar, title = {Boosted Audio-Visual HMM for Speech Reading}, author = {P. Yin and I. Essa and J. M. Rehg}, year = {2003}, date = {2003-11-01}, booktitle = {Asilomar Conference on Signals, Systems, and Computers}, pages = {2013-2018}, address = {Asilomar, CA, USA}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Yin-BASR, title = {Boosted Audio-Visual HMM for Speech Reading}, author = {P. Yin and I. Essa and J. M. Rehg}, year = {2003}, date = {2003-10-01}, booktitle = {International Workshop on Analysis and Modeling of Faces and Gestures (AMFG)}, pages = {68--73}, address = {Nice, France}, series = {held in conjunction with IEEE ICCV 2003}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Parry-RSTE, title = {Rhythmic Similarity through Elaboration}, author = {R. M. Parry and I.
Essa}, year = {2003}, date = {2003-10-01}, booktitle = {Proceedings of International Conference on Music Information Retrieval}, pages = {251--252}, address = {Baltimore, MD}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{2003-Kwatra-GTIVSUGC, title = {Graphcut Textures: Image and Video Synthesis Using Graph Cuts}, author = {V. Kwatra and A. Schödl and I. Essa and G. Turk and A. Bobick}, year = {2003}, date = {2003-07-01}, journal = {ACM SIGGRAPH Proceedings of Annual Conference on Computer graphics and interactive techniques}, volume = {22}, number = {3}, pages = {277--286}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{2003-Minnen-EGLHEAR, title = {Expectation Grammars: Leveraging High-Level Expectations for Activity Recognition}, author = {D. Minnen and I. Essa and T. Starner}, year = {2003}, date = {2003-06-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {626-632}, keywords = {activity recognition, computational video, computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Steedly-SPSFM, title = {Spectral Partitioning for Structure from Motion}, author = {D. Steedly and I. Essa and F. Dellaert}, year = {2003}, date = {2003-01-01}, booktitle = {IEEE International Conference on Computer Vision (ICCV)}, pages = {996--1003}, publisher = {IEEE Computer Society}, address = {Nice, France}, keywords = {computer vision, ICCV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Xu-MHPASBSS, title = {Mandatory Human Participation: A New Authentication Scheme for Building Secure Systems}, author = {J. Xu and R. Lipton and I. Essa and M. Sung and Y. Zhu}, year = {2003}, date = {2003-01-01}, booktitle = {Proceedings of International Conference on Computer Communications and Networks}, address = {Dallas, Texas, USA}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Hamid-AARUGM, title = {ARGMode - Activity Recognition using Graphical Models}, author = {R. Hamid and Y. Huang and I. Essa}, year = {2003}, date = {2003-01-01}, booktitle = {Proceedings of IEEE Workshop on Event Mining, Event Detection, and Recognition in Video}, volume = {4}, pages = {38--44}, publisher = {IEEE Computer Society}, address = {Madison, WI}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Haro-EBST, title = {Exemplar Based Surface Texture}, author = {Antonio Haro and Irfan Essa}, editor = {T Ertl. and B. Girod and G. Greiner and H. Niemann and H. -P. Seidel and E. Steinbach and R. Westermann}, year = {2003}, date = {2003-01-01}, urldate = {2003-01-01}, booktitle = {Proceedings of Conference on Vision, Modeling, and Visualization}, pages = {95--101}, publisher = {Akademische Verlagsgesellschaft Aka GmbH, Berlin}, address = {Munich, Germany}, organization = {Computer Graphics & Visualization Group and the Media Technology Group of Technische Universitat Munchen}, keywords = {computer graphics, computer vision, video textures}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Ruddarraju-FMCHPT, title = {Fast Multiple Camera Head Pose Tracking}, author = {R. Ruddarraju and A. Haro and I. 
Essa}, year = {2003}, date = {2003-01-01}, booktitle = {Proceedings of International Conference on Vision Interface (VI)}, address = {Halifax, Canada}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2003-Ruddarraju-PUIUVT, title = {Perceptual User Interfaces using Vision-Based Eye Tracking}, author = {Ravi Ruddarraju and Antonio Haro and Kris Nagel and Quan Tran and Irfan Essa and Gregory Abowd and Elizabeth Mynatt}, url = {https://doi.org/10.1145/958432.958475}, doi = {10.1145/958432.958475}, year = {2003}, date = {2003-01-01}, urldate = {2003-01-01}, booktitle = {International Conference on Multimodal Interfaces (ICMI)}, address = {Vancouver, Canada}, abstract = {We present a multi-camera vision-based eye tracking method to robustly locate and track user's eyes as they interact with an application. We propose enhancements to various vision-based eye-tracking approaches, which include (a) the use of multiple cameras to estimate head pose and increase coverage of the sensors and (b) the use of probabilistic measures incorporating Fisher's linear discriminant to robustly track the eyes under varying lighting conditions in real-time. We present experiments and quantitative results to demonstrate the robustness of our eye tracking in two application prototypes. }, keywords = {computer vision, eye-tracking, ICMI, multimodal interfaces}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2002-Schodl-CAVS, title = {Controlled Animation of Video Sprites}, author = {A. Schödl and I. Essa}, url = {http://www-static.cc.gatech.edu/gvu/perception/projects/videotexture/SCA02/index.html}, doi = {10.1145/545261.545281}, year = {2002}, date = {2002-08-01}, booktitle = {ACM/Eurographics Symposium on Computer Animation (SCA)}, publisher = {ACM Press}, address = {San Antonio, TX, USA}, organization = {ACM SIGGRAPH}, abstract = {We introduce a new optimization algorithm for video sprites to animate realistic-looking characters. Video sprites are animations created by rearranging recorded video frames of a moving object. Our new technique to find good frame arrangements is based on repeated partial replacements of the sequence. It allows the user to specify animations using a flexible cost function. We also show a fast technique to compute video sprite transitions and a simple algorithm to correct for perspective effects of the input footage. We use our techniques to create character animations of animals, which are difficult both to train in the real world and to animate as 3D models.}, keywords = {computer animation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2002-Abowd-AHDTSA, title = {The Aware Home: Developing Technologies for Successful Aging}, author = {Gregory Abowd and Aaron Bobick and Irfan Essa and Elizabeth Mynatt and Wendy Rogers}, url = {https://www.academia.edu/13330406/The_aware_home_A_living_laboratory_for_technologies_for_successful_aging}, year = {2002}, date = {2002-08-01}, urldate = {2002-08-01}, booktitle = {AAAI Workshop on Automation as a Care Giver}, address = {Edmonton, Alberta, Canada}, organization = {AAAI}, series = {Held in conjunction with American Association of Artificial Intelligence (AAAI) Conference 2002}, abstract = {We describe our ongoing research in the area of developing and testing technologies for successful aging. Sensing and perception technologies can enable a home environment to be aware of the whereabouts and activities of its occupants. 
Motivated by the desire to use such an awareness to help maintain independence and quality of life for an aging population, we describe the technological, design and engineering research challenges inherent in this problem domain. Our work is situated in the Georgia Tech Broadband Institute's Residential Laboratory, a unique living laboratory for this exploration of ubiquitous computing in a domestic setting.}, keywords = {AAAI, aging-in-place, intelligent environments}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2002-Moore-RMAFVURMAFVUSCG, title = {Recognizing Multitasked Activities from Video using Stochastic Context-Free Grammar}, author = {D. Moore and I. Essa}, year = {2002}, date = {2002-07-01}, booktitle = {American Association of Artificial Intelligence Conference (AAAI)}, address = {Edmonton, Alberta, Canada}, organization = {AAAI}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2002-Essa-BAHTL, title = {Building an Aware Home: Technologies for the way we may live}, author = {Irfan Essa and Gregory Abowd and Aaron Bobick and Elizabeth Mynatt and Wendy Rogers}, year = {2002}, date = {2002-01-01}, urldate = {2002-01-01}, booktitle = {Proceedings of First International Workshop on Man-Machine Symbiosis}, address = {Kyoto, Japan}, keywords = {aging-in-place, computational health, human-computer interaction}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2002-Haro-LVPE, title = {Learning Video Processing by Example}, author = {A. Haro and I. Essa}, editor = {R. Kasturi and D. Laurendeau and C. Suen}, year = {2002}, date = {2002-01-01}, booktitle = {Proceedings of International Conference on Pattern Recognition (ICPR)}, volume = {1}, pages = {487--491}, address = {Quebec, Canada}, organization = {CIPPRS and IAPR and IEEE}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2001-Reveret-VTCSRFA, title = {Visual Tracking and Coding of Speech Related Facial Actions}, author = {L. Reveret and I. Essa}, year = {2001}, date = {2001-12-01}, booktitle = {Proceedings of IEEE Workshop on Cues in Communication}, publisher = {IEEE Computer Society}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2001-Brostow-IMBSMA, title = {Image-Based Motion Blur for Stop Motion Animation}, author = {G. J. Brostow and I. Essa}, editor = {Eugene Fiume}, url = {http://www-static.cc.gatech.edu/gvu/perception/projects/blur/index.html}, doi = {10.1145/383259.383325}, year = {2001}, date = {2001-07-01}, booktitle = {ACM SIGGRAPH Proceedings of Annual Conference on Computer graphics and interactive techniques}, pages = {561--566}, publisher = {ACM Press / ACM SIGGRAPH}, organization = {ACM}, series = {Computer Graphics Proceedings, Annual Conference Series}, keywords = {computational video, motion blur, optical flow}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2001-Steedly-PIINLSFM, title = {Propagation of Innovative Information in Non-Linear Least-Squares Structure from Motion}, author = {D. Steedly and I. Essa}, year = {2001}, date = {2001-01-01}, booktitle = {IEEE International Conference on Computer Vision (ICCV)}, volume = {2}, pages = {223--229}, address = {Vancouver, Canada}, keywords = {computer vision, ICCV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2001-Stillman-TRMSAE, title = {Towards Reliable Multimodal Sensing in Aware Environments}, author = {S.
Stillman and I. Essa}, year = {2001}, date = {2001-01-01}, booktitle = {Workshop on Perceptual User Interfaces (PUI)}, publisher = {ACM Press}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2001-Haro-RPPBRFSHSS, title = {Real-time, Photo-realistic, Physically Based Rendering of Fine Scale Human Skin Structure}, author = {A. Haro and B. Guenter and I. Essa}, editor = {S. Gortler and K. Myszkowski}, year = {2001}, date = {2001-01-01}, booktitle = {Proceedings of Eurographics Workshop on Rendering (ESR)}, pages = {53--62}, address = {London, England}, organization = {Eurographics}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2001-Schodl-DLFO, title = {Depth layers from occlusions}, author = {A. Schödl and I. Essa}, url = {http://www.cc.gatech.edu/cpl/projects/depthfromocclusion/}, doi = {http://dx.doi.org/10.1109/CVPR.2001.990534}, year = {2001}, date = {2001-01-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, volume = {1}, pages = {639 - 644}, publisher = {IEEE Computer Society Press}, address = {Kauai, Hawaii, USA}, organization = {IEEE Computer Society}, keywords = {computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2000-Schodl-MLVR, title = {Machine Learning for Video-Based Rendering.}, author = {A. Schödl and I. Essa}, url = {https://www.cc.gatech.edu/cpl/projects/videotexture/NIPS2000/index.html}, year = {2000}, date = {2000-12-01}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, pages = {1002-1008}, keywords = {computer animation, reinforcement learning, video textures}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2000-Mynatt-IOAP, title = {Increasing the Opportunities for Aging in Place}, author = {E. Mynatt and I. Essa and W. Rogers}, doi = {10.1145/355460.355475}, year = {2000}, date = {2000-11-01}, booktitle = {ACM Conference on Universal Usability (CUU)}, abstract = {A growing social problem in the U.S. and elsewhere is supporting older adults who want to continue living independently as opposed to moving to an institutional care setting. The ``Aging in Place'' project strives to delay taking that first step away from the family home. Through the careful placement of technological support we believe older adults can continue living in their own homes longer. The goal of our research is to take a three-pronged approach to understanding the potential of such environmental supports. The research team combines expertise in human-computer interaction, computational perception, and cognitive aging. Together the team is assessing the feasibility of designing environments that aid older individuals in maintaining their independence. Based on our initial research, we are dividing this work into three parts: recognizing and averting crisis, assisting daily routines, and supporting peace of mind for adult children.}, keywords = {aging-in-place}, pubstate = {published}, tppubtype = {inproceedings} } @article{2000-Essa-USSAE, title = {Ubiquitous sensing for smart and aware environments}, author = {I. Essa}, doi = {10.1109/98.878538}, year = {2000}, date = {2000-10-01}, journal = {IEEE Personal Communications}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{2000-Schodl-VT, title = {Video textures}, author = {A. Schödl and R. Szeliski and D. H. Salesin and I.
Essa}, doi = {10.1145/344779.345012}, year = {2000}, date = {2000-08-01}, booktitle = {ACM SIGGRAPH Proceedings of Annual Conference on Computer graphics and interactive techniques}, pages = {489--498}, publisher = {ACM Press/Addison-Wesley Publishing Co.}, address = {New York, NY, USA}, abstract = {This paper introduces a new type of medium, called a video texture, which has qualities somewhere between those of a photograph and a video. A video texture provides a continuous infinitely varying stream of images. While the individual frames of a video texture may be repeated from time to time, the video sequence as a whole is never repeated exactly. Video textures can be used in place of digital photos to infuse a static image with dynamic qualities and explicit actions. We present techniques for analyzing a video clip to extract its structure, and for synthesizing a new, similar looking video of arbitrary length. We combine video textures with view morphing techniques to obtain 3D video textures. We also introduce video-based animation, in which the synthesis of video textures can be guided by a user through high-level interactive controls. Applications of video textures and their extensions include the display of dynamic scenes on web pages, the creation of dynamic backdrops for special effects and games, and the interactive control of video-based animation.}, keywords = {computational video}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2000-Essa-CDVSE, title = {A Course on Digital Video Special Effects}, author = {I. Essa and G. J. Brostow}, year = {2000}, date = {2000-06-01}, booktitle = {Proceedings of IEEE CS Workshop on Undergraduate Education and Image Computation}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2000-Abowd-LLFCEGGIT, title = {Living laboratories: the future computing environments group at the Georgia Institute of Technology}, author = {Gregory Abowd and Chris Atkeson and Aaron Bobick and Irfan Essa and Blair MacIntyre and Elizabeth Mynatt and Thad Starner}, doi = {10.1145/633292.633416}, year = {2000}, date = {2000-04-01}, urldate = {2000-04-01}, booktitle = {ACM CHI Conference on Human factors in Computing Systems}, pages = {215--216}, publisher = {ACM Press}, address = {New York, NY, USA}, keywords = {aging-in-place, CHI, computational health, intelligent environments}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2000-Haro-NCVSRT, title = {A Non-Invasive Computer Vision System For Reliable Eye Tracking}, author = {A. Haro and I. Essa and M. Flickner}, editor = {G. Szwillus and T. Turner}, doi = {10.1145/633292.633385}, year = {2000}, date = {2000-04-01}, booktitle = {ACM CHI Conference on Human factors in Computing Systems}, pages = {167--168}, publisher = {ACM press}, organization = {ACM}, abstract = {Knowing what the user is attending to and what they are looking at is essential for creating attentive user interfaces. Towards this end, we are building a reliable, real-time, non-invasive eye tracker using computer vision. Our system can robustly locate and track eyes without any calibration, and estimate the user's focus of attention. 
We have built several higher-level processes on top of this tracking system and have done some user studies to test the viability of our approach.}, keywords = {eye-tracking}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{2000-Haro-DTEUTPPDA, title = {Detecting and Tracking Eyes By Using Their Physiological Properties, Dynamics, and Appearance}, author = {A. Haro and M. Flickner and I. Essa}, editor = {J. Ponce and J. Malik and D. Kriegman and D. Forsyth}, year = {2000}, date = {2000-01-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, volume = {1}, pages = {163--168}, publisher = {IEEE}, address = {Hilton Head Island, SC, USA}, organization = {IEEE}, keywords = {computer vision, CVPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1999-Stillman-STRMPWMC, title = {A system for tracking and recognizing multiple people with multiple cameras}, author = {S. Stillman and R. Tanawongsuwan and I. Essa}, year = {1999}, date = {1999-03-01}, booktitle = {Proceedings of Audio and Vision-based Person Authentication (AVBPA)}, address = {Washington, DC, USA}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{1999-Tunawongsuwan-RTPMRA, title = {Robust Tracking of People by a Mobile Robotic Agent}, author = {R. Tanawongsuwan and A. Stoytchev and I. Essa}, year = {1999}, date = {1999-01-01}, number = {99-19}, institution = {Georgia Institute of Technology, GVU Center}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } @inproceedings{1999-Schodl-APMHT, title = {Adaptive Parallelization of Model-Based Head Tracking.}, author = {A. Schödl and K. Schwan and I. Essa}, year = {1999}, date = {1999-01-01}, booktitle = {Proceedings of International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA)}, pages = {1571-1577}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1999-Brostow-MBDV, title = {Motion Based Decompositing of Video}, author = {G. J. Brostow and I. Essa}, url = {http://www.cc.gatech.edu/cpl/projects/layering/}, doi = {http://dx.doi.org/10.1109/ICCV.1999.791190}, year = {1999}, date = {1999-01-01}, booktitle = {IEEE International Conference on Computer Vision (ICCV)}, volume = {1}, pages = {8--13}, publisher = {IEEE Computer Society}, keywords = {computer vision, ICCV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1999-Moore-EHAOCRT, title = {Exploiting Human Actions and Object Context for Recognition Tasks}, author = {D. Moore and I. Essa and M. Hayes}, url = {https://ieeexplore.ieee.org/document/791201}, doi = {10.1109/ICCV.1999.791201}, year = {1999}, date = {1999-01-01}, urldate = {1999-01-01}, booktitle = {IEEE International Conference on Computer Vision (ICCV)}, pages = {80--86}, address = {Corfu, Greece}, organization = {IEEE Computer Society}, abstract = {Our goal is to exploit human motion and object context to perform action recognition and object classification. Towards this end, we introduce a framework for recognizing actions and objects by measuring image-, object- and action-based information from video. Hidden Markov models are combined with object context to classify hand actions, which are aggregated by a Bayesian classifier to summarize activities. We also use Bayesian methods to differentiate the class of unknown objects by evaluating detected actions along with low-level, extracted object features.
Our approach is appropriate for locating and classifying objects under a variety of conditions including full occlusion. We show experiments where both familiar and previously unseen objects are recognized using action and context information. }, keywords = {activity recognition, computer vision, ICCV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1999-Moore-CMHAR, title = {Context Management for Human Activity Recognition}, author = {D. Moore and I. Essa and M. Hayes}, year = {1999}, date = {1999-01-01}, booktitle = {Audio and Vision-based Person Authentication (AVBPA)}, address = {Washington, DC, USA}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1999-Kidd-AHLLUCR, title = {The Aware Home: A Living Laboratory for Ubiquitous Computing Research.}, author = {Cory Kidd and Rob Orr and Gregory Abowd and Chris Atkeson and Irfan Essa and Blair MacIntyre and Elizabeth Mynatt and Thad Starner and Wendy Newstetter}, url = {https://link.springer.com/chapter/10.1007/10705432_17 https://www.cc.gatech.edu/fce/ahri/publications/cobuild99_final.PDF}, doi = {10.1007/10705432_17}, year = {1999}, date = {1999-01-01}, urldate = {1999-01-01}, booktitle = {Proceedings of Conference on Cooperative Buildings (CoBuild) [Cooperative Buildings. Integrating Information, Organizations and Architecture]}, pages = {191-198}, publisher = {Springer Berlin / Heidelberg}, abstract = {We are building a home, called the Aware Home, to create a living laboratory for research in ubiquitous computing for everyday activities. This paper introduces the Aware Home project and outlines some of our technology- and human-centered research objectives in creating the Aware Home. }, keywords = {aging-in-place, computational health, intelligent environments}, pubstate = {published}, tppubtype = {inproceedings} } @article{1999-Essa-CSP, title = {Computers Seeing People}, author = {I. Essa}, year = {1999}, date = {1999-00-01}, journal = {AI Magazine}, volume = {20}, number = {1}, pages = {69--82}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{1998-Schodl-HTUTPM, title = {Head Tracking using a Textured Polygonal Model}, author = {A. Schödl and A. Haro and I. Essa}, year = {1998}, date = {1998-01-01}, booktitle = {Workshop on Perceptual User Interfaces (PUI)}, address = {San Francisco, CA, USA}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{1998-Moore-CMHAR, title = {Context Management for Human Activity Recognition}, author = {D. Moore and I. Essa and M. Hayes}, year = {1998}, date = {1998-01-01}, number = {GIT-GVU-98-26}, institution = {Georgia Institute of Technology, Graphics, Visualization, and Usability Center}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } @inproceedings{1997-Gardner-PASAD, title = {Prosody Analysis for Speaker Affect Determination}, author = {A. Gardner and I.
Essa}, year = {1997}, date = {1997-10-01}, booktitle = {Workshop on Perceptual User Interfaces (PUI)}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1997-Essa-CPFCE, title = {Computational Perception in Future Computing Environments}, author = {Irfan Essa and Gregory Abowd and Chris Atkeson}, url = {https://www.cc.gatech.edu/fce/pubs/pui97-fce.html}, year = {1997}, date = {1997-01-01}, urldate = {1997-01-01}, booktitle = {Workshop on Perceptual User Interfaces (PUI)}, keywords = {aging-in-place, computational health, intelligent environments}, pubstate = {published}, tppubtype = {inproceedings} } @article{1997-Essa-CAIRFE, title = {Coding, Analysis, Interpretation, and Recognition of Facial Expressions}, author = {I. Essa and A. Pentland}, year = {1997}, date = {1997-01-01}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)}, volume = {19}, number = {7}, pages = {757--763}, keywords = {}, pubstate = {published}, tppubtype = {article} } @incollection{1997-Essa-MMRIFE, title = {Motion and Model-based Recognition and Interpretation of Facial Expressions}, author = {I. Essa and A. Pentland}, editor = {M. Shah and R. Jain}, year = {1997}, date = {1997-01-01}, booktitle = {Motion-Based Recognition}, publisher = {Kluwer Academic Publishers}, series = {Computational Imaging and Vision Series}, keywords = {}, pubstate = {published}, tppubtype = {incollection} } @inproceedings{1996-Basu-MRMHT, title = {Motion Regularization for Model-based Head Tracking}, author = {Sumit Basu and Irfan Essa and Alex Pentland}, url = {https://ieeexplore.ieee.org/document/547019}, doi = {10.1109/ICPR.1996.547019}, isbn = {0-8186-7282-X}, year = {1996}, date = {1996-10-01}, urldate = {1996-10-01}, booktitle = {Proceedings of International Conference on Pattern Recognition (ICPR)}, abstract = {This paper describes a method for the robust tracking of rigid head motion from video. This method uses a 3D ellipsoidal model of the head and interprets the optical flow in terms of the possible rigid motions of the model. This method is robust to large angular and translational motions of the head and is not subject to the singularities of a 2D model. The method has been successfully applied to heads with a variety of shapes, hair styles, etc. This method also has the advantage of accurately capturing the 3D motion parameters of the head. This accuracy is shown through comparison with a ground truth synthetic sequence (a rendered 3D animation of a model head). In addition, the ellipsoidal model is robust to small variations in the initial fit, enabling the automation of the model initialization. Lastly, due to its consideration of the entire 3D aspect of the head, the tracking is very stable over a large number of frames. This robustness extends even to sequences with very low frame rates and noisy camera images. 
}, keywords = {computer vision, face & gesture, ICPR}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1996-Essa-MTIAFHUIFV, title = {Modeling, Tracking and Interactive Animation of Faces and Heads using Input from Video}, author = {Irfan Essa and Sumit Basu and Trevor Darrell and Alex Pentland}, url = {https://ieeexplore.ieee.org/abstract/document/540489}, doi = {10.1109/CA.1996.540489}, isbn = {0-8186-7588-8}, year = {1996}, date = {1996-06-01}, urldate = {1996-06-01}, booktitle = {Computer Animation Conference}, pages = {68--79}, publisher = {IEEE Computer Society Press}, abstract = {We describe tools that use measurements from video for the extraction of facial modeling and animation parameters, head tracking, and real time interactive facial animation. These tools share common goals but rely on varying details of physical and geometric modeling and in their input measurement system. Accurate facial modeling involves fine details of geometry and muscle coarticulation. By coupling pixel by pixel measurements of surface motion to a physically based face model and a muscle control model, we have been able to obtain detailed spatio temporal records of both the displacement of each point on the facial surface and the muscle control required to produce the observed facial motion. We discuss the importance of this visually extracted representation in terms of realistic facial motion synthesis. A similar method that uses an ellipsoidal model of the head coupled with detailed estimates of visual … }, keywords = {computer vision, face & gesture}, pubstate = {published}, tppubtype = {inproceedings} } @article{1996-Darrell-TGMUIV, title = {Task-specific Gesture Modeling using Interpolated Views}, author = {T. Darrell and I. Essa and A. Pentland}, year = {1996}, date = {1996-01-01}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)}, volume = {18}, number = {12}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{1995-Brand-CAVGU, title = {Causal Analysis for Visual Gesture Understanding}, author = {M. Brand and I. Essa}, year = {1995}, date = {1995-10-01}, booktitle = {AAAI Fall Symposium on Computational Models for Integrating Language and Vision}, abstract = {We are exploring the use of high-level knowledge about bodies in the visual understanding of gesture. Our hypothesis is that many gestures are metaphorically derived from the motor programs of our everyday interactions with objects and people. For example, many dismissive gestures look like an imaginary object is being brushed or tossed away. At the discourse level, this implicit mass represents a referent in the conversation; at the scene-formation level, the dismissive gesture obeys many of the kinematic and dynamic constraints that would shape an actual tossing. Thus this metaphor provides us with constraints for both discourse annotation and visual processing. In this paper we present some preliminary results interpreting complex gesture sequences in video.}, keywords = {gesture recognition}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1995-Darrell-AEGAIE, title = {Attention-driven Expression and Gesture Analysis in an Interactive Environment}, author = {T. Darrell and I. Essa and A. Pentland}, year = {1995}, date = {1995-01-01}, booktitle = {Proceedings of International Workshop on Automatic Face and Gesture Recognition (FG)}, pages = {135--140}, editor = {M.
Bichsel}, address = {Zurich, Switzerland}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1995-Essa-FERUDMME, title = {Facial Expression Recognition using a Dynamic Model and Motion Energy}, author = {I. Essa and A. Pentland}, year = {1995}, date = {1995-01-01}, booktitle = {IEEE International Conference on Computer Vision (ICCV)}, pages = {360--367}, publisher = {IEEE Computer Society}, address = {Cambridge, MA}, keywords = {computer vision, ICCV}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1994-Darrell-CINREA, title = {Correlation and Interpolation Networks for Real-Time Expression Analysis/Synthesis}, author = {T. Darrell and I. Essa and A. Pentland}, editor = {G. Tesauro and D. S. Touretzky and T. K. Leen}, url = {https://papers.nips.cc/paper/999-correlation-and-interpolation-networks-for-real-time-expression-analysissynthesis}, year = {1994}, date = {1994-12-01}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, volume = {7}, publisher = {MIT Press}, abstract = {We describe a framework for real-time tracking of facial expressions that uses neurally-inspired correlation and interpolation methods. A distributed view-based representation is used to characterize facial state, and is computed using a replicated correlation network. The ensemble response of the set of view correlation scores is input to a network based interpolation method, which maps perceptual state to motor control states for a simulated 3-D face model. Activation levels of the motor state correspond to muscle activations in an anatomically derived model. By integrating fast and robust 2-D processing with 3-D models, we obtain a system that is able to quickly track and interpret complex facial motions in real-time.}, keywords = {face & gesture, face processing, gesture recognition}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1994-Essa-VSOEFAP, title = {A Vision System for Observing and Extracting Facial Action Parameters}, author = {I. Essa and A. Pentland}, url = {https://ieeexplore.ieee.org/document/323813}, doi = {10.1109/CVPR.1994.323813}, year = {1994}, date = {1994-01-01}, urldate = {1994-01-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, pages = {76--83}, publisher = {IEEE Computer Society}, abstract = {We describe a computer vision system for observing the "action units" of a face using video sequences as input. The visual observation (sensing) is achieved by using an optimal estimation optical flow method coupled with a geometric and a physical (muscle) model describing the facial structure. This modeling results in a time-varying spatial patterning of facial shape and a parametric representation of the independent muscle action groups, responsible for the observed facial motions. These muscle action patterns may then be used for analysis, interpretation, and synthesis. Thus, by interpreting facial motions within a physics-based optimal estimation framework, a new control model of facial movement is developed. The newly extracted action units (which we name "FACS+") are both physics and geometry-based, and extend the well-known FACS parameters for facial expressions by adding temporal information and non-local spatial patterning of facial motion. }, keywords = {computer vision, CVPR, face & gesture, face processing}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1994-Essa-TFM, title = {Tracking Facial Motion}, author = {I. Essa and T.
Darrell and A. Pentland}, year = {1994}, date = {1994-01-01}, booktitle = {Proceedings of IEEE Workshop on Motion of Nonrigid and Articulated Objects}, pages = {36--42}, publisher = {IEEE Computer Society}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @phdthesis{1994-Essa-AISFE, title = {Analysis, Interpretation, and Synthesis of Facial Expressions}, author = {I. Essa}, url = {https://dspace.mit.edu/handle/1721.1/29086}, year = {1994}, date = {1994-01-01}, address = {MIT Media Laboratory, Cambridge, MA 02139, USA}, school = {Massachusetts Institute of Technology}, abstract = {This thesis describes a computer vision system for observing the "action units" of a face using video sequences as input. The visual observation (sensing) is achieved by using an optimal estimation optical flow method coupled with a geometric and a physical (muscle) model describing the facial structure. This modeling results in a time-varying spatial patterning of facial shape and a parametric representation of the independent muscle action groups responsible for the observed facial motions. These muscle action patterns are then used for analysis, interpretation, recognition, and synthesis of facial expressions. Thus, by interpreting facial motions within a physics-based optimal estimation framework, a new control model of facial movement is developed. The newly extracted action units (which we name "FACS+") are both physics and geometry-based, and extend the well known FACS parameters for facial expressions by adding temporal information and non-local spatial patterning of facial motion.}, keywords = {}, pubstate = {published}, tppubtype = {phdthesis} } @inproceedings{1994-Pentland-VGA, title = {Visually guided animation}, author = {Alex Pentland and Trevor Darrell and Irfan Essa and Ali Azarbayejani and Stan Sclaroff}, year = {1994}, date = {1994-01-01}, urldate = {1994-01-01}, booktitle = {Computer Animation Conference}, pages = {76--83}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society}, keywords = {computer animation}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{1994-Pentland-VGIA, title = {Visually guided interaction and animation}, author = {Alex Pentland and Stan Sclaroff and Trevor Darrell and Irfan Essa and Ali Azarbayejani and Thad Starner}, year = {1994}, date = {1994-01-01}, urldate = {1994-01-01}, booktitle = {Asilomar Conference on Signals, Systems, and Computers}, volume = {2}, pages = {1287--1291}, address = {Pacific Grove, CA}, keywords = {computer animation, human-computer interaction, multimodal interfaces}, pubstate = {published}, tppubtype = {inproceedings} } @incollection{1993-Essa-PMGV, title = {Physically-based Modeling for Graphics and Vision}, author = {I. Essa and S. Sclaroff and A. Pentland}, editor = {Ralph Martin}, year = {1993}, date = {1993-01-01}, booktitle = {Directions in Geometric Computing}, pages = {160--196}, publisher = {Information Geometers, U.K.}, keywords = {}, pubstate = {published}, tppubtype = {incollection} } @article{1992-Essa-UAPGMGA, title = {A Unified Approach for Physical and Geometric Modeling for Graphics and Animation}, author = {I. Essa and S. Sclaroff and A.
Pentland}, year = {1992}, date = {1992-05-01}, journal = {Computer Graphics Forum, The International Journal of the Eurographics Association}, volume = {11}, number = {3}, pages = {129--138}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{1992-Sclaroff-VMAUAPGMGA, title = {Vision-based Modeling: An application of a Unified Approach for Physical and Geometric Modeling for Graphics and Animation}, author = {S. Sclaroff and I. Essa and A. Pentland}, year = {1992}, date = {1992-01-01}, booktitle = {Proceedings of Eurographics Workshop on Animations and Simulations}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @techreport{1991-Essa-EPDSRLE, title = {Estimated Physics: Dynamic Simulation in a Resource Limited Environment}, author = {I. Essa and A. Pentland}, year = {1991}, date = {1991-01-01}, number = {182}, institution = {M.I.T. Media Laboratory, Vision and Modeling Group}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } @mastersthesis{1990-Essa-CDCFFPBVWM, title = {Contact Detection, Collision Forces and Friction for Physically Based Virtual World Modeling}, author = {I. Essa}, url = {https://dspace.mit.edu/handle/1721.1/14054}, year = {1990}, date = {1990-06-01}, urldate = {1990-06-01}, school = {Massachusetts Institute of Technology}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } @article{1990-Pentland-TMSVSMF, title = {The Thingworld Modeling System: Virtual Sculpting by Modal Forces}, author = {A. Pentland and I. Essa and M. Friedmann and B. Horowitz and S. E. Sclaroff}, doi = {10.1145/91394.91434}, year = {1990}, date = {1990-03-01}, urldate = {1990-03-01}, journal = {ACM SIGGRAPH Proceedings of Symposium on Interactive 3D Graphics (I3DG)}, volume = {24}, number = {2}, pages = {143--144}, keywords = {ACM, computer animation, computer graphics, I3DG, physically-based modeling, SIGGRAPH}, pubstate = {published}, tppubtype = {article} }