Some recent publications for 2023
Here is a list of some recent works accepted for publication that I am honored to be part of. They appear in venues including NeurIPS, CVPR, ICLR, CHI, BMVC, CoRL, and IEEE Robotics and Automation Letters. Excited to share these new efforts.
Kihyuk Sohn, Nataniel Ruiz, Kimin Lee, Daniel Castro Chin, Irina Blok, Huiwen Chang, Jarred Barber, Lu Jiang, Glenn Entis, Yuanzhen Li, Yuan Hao, Irfan Essa, Michael Rubinstein, Dilip Krishnan
StyleDrop: Text-to-Image Generation in Any Style Proceedings Article
In: Advances in Neural Information Processing Systems (NeurIPS), 2023.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, generative AI, google, NeurIPS
@inproceedings{2023-Sohn-STGS,
title = {StyleDrop: Text-to-Image Generation in Any Style},
author = {Kihyuk Sohn and Nataniel Ruiz and Kimin Lee and Daniel Castro Chin and Irina Blok and Huiwen Chang and Jarred Barber and Lu Jiang and Glenn Entis and Yuanzhen Li and Yuan Hao and Irfan Essa and Michael Rubinstein and Dilip Krishnan},
url = {https://arxiv.org/abs/2306.00983
https://openreview.net/forum?id=KoaFh16uOc
https://proceedings.neurips.cc/paper_files/paper/2023/hash/d33b177b69425e7685b0b1c05bd2a5e4-Abstract-Conference.html},
doi = {10.48550/arXiv.2306.00983},
year = {2023},
date = {2023-12-11},
urldate = {2023-12-11},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
abstract = {Pre-trained large text-to-image models synthesize impressive images with an appropriate use of text prompts. However, ambiguities inherent in natural language and out-of-distribution effects make it hard to synthesize image styles that leverage a specific design pattern, texture or material. In this paper, we introduce StyleDrop, a method that enables the synthesis of images that faithfully follow a specific style using a text-to-image model. The proposed method is extremely versatile and captures nuances and details of a user-provided style, such as color schemes, shading, design patterns, and local and global effects. It efficiently learns a new style by fine-tuning very few trainable parameters (less than 1% of total model parameters) and improving the quality via iterative training with either human or automated feedback. Better yet, StyleDrop is able to deliver impressive results even when the user supplies only a single image that specifies the desired style. An extensive study shows that, for the task of style tuning text-to-image models, StyleDrop implemented on Muse convincingly outperforms other methods, including DreamBooth and textual inversion on Imagen or Stable Diffusion. More results are available at our project website.},
howpublished = {arXiv:2306.00983},
keywords = {arXiv, computer vision, generative AI, google, NeurIPS},
pubstate = {published},
tppubtype = {inproceedings}
}
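To make the parameter-efficient tuning described in the StyleDrop abstract concrete, here is a minimal PyTorch sketch, assuming a generic frozen transformer backbone, a toy bottleneck adapter, and a stand-in reconstruction loss; none of the module names or the objective come from the released implementation.

# Minimal sketch (not the StyleDrop code): only a tiny adapter is trained on top
# of a frozen backbone, mirroring the "<1% of parameters" idea in the abstract.
import torch
import torch.nn as nn

class StyleAdapter(nn.Module):
    """Small bottleneck adapter inserted after a frozen backbone layer (hypothetical)."""
    def __init__(self, dim: int, rank: int = 16):
        super().__init__()
        self.down = nn.Linear(dim, rank)
        self.up = nn.Linear(rank, dim)
        nn.init.zeros_(self.up.weight)       # start as identity (no style change)
        nn.init.zeros_(self.up.bias)
    def forward(self, h):
        return h + self.up(torch.relu(self.down(h)))

backbone = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
for p in backbone.parameters():
    p.requires_grad_(False)                  # freeze the pretrained model

adapter = StyleAdapter(dim=256)
opt = torch.optim.AdamW(adapter.parameters(), lr=1e-3)

def training_step(tokens, style_target):
    """One step of style tuning on a single reference image's token features."""
    h = adapter(backbone(tokens))
    loss = nn.functional.mse_loss(h, style_target)   # stand-in objective
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

# toy usage: one "style image" represented by random token features
print(training_step(torch.randn(1, 64, 256), torch.randn(1, 64, 256)))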
Lijun Yu, Yong Cheng, Zhiruo Wang, Vivek Kumar, Wolfgang Macherey, Yanping Huang, David A. Ross, Irfan Essa, Yonatan Bisk, Ming-Hsuan Yang, Kevin Murphy, Alexander G. Hauptmann, Lu Jiang
SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs Proceedings Article
In: Advances in Neural Information Processing Systems (NeurIPS), 2023.
Abstract | Links | BibTeX | Tags: arXiv, computational video, computer vision, generative AI, NeurIPS
@inproceedings{2023-Yu-SSPAMGWFL,
title = {SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs},
author = {Lijun Yu and Yong Cheng and Zhiruo Wang and Vivek Kumar and Wolfgang Macherey and Yanping Huang and David A. Ross and Irfan Essa and Yonatan Bisk and Ming-Hsuan Yang and Kevin Murphy and Alexander G. Hauptmann and Lu Jiang},
url = {https://arxiv.org/abs/2306.17842
https://openreview.net/forum?id=CXPUg86A1D
https://proceedings.neurips.cc/paper_files/paper/2023/hash/a526cc8f6ffb74bedb6ff313e3fdb450-Abstract-Conference.html},
doi = {10.48550/arXiv.2306.17842},
year = {2023},
date = {2023-12-11},
urldate = {2023-12-11},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
abstract = {In this work, we introduce Semantic Pyramid AutoEncoder (SPAE) for enabling frozen LLMs to perform both understanding and generation tasks involving non-linguistic modalities such as images or videos. SPAE converts between raw pixels and interpretable lexical tokens (or words) extracted from the LLM's vocabulary. The resulting tokens capture both the semantic meaning and the fine-grained details needed for visual reconstruction, effectively translating the visual content into a language comprehensible to the LLM, and empowering it to perform a wide array of multimodal tasks. Our approach is validated through in-context learning experiments with frozen PaLM 2 and GPT 3.5 on a diverse set of image understanding and generation tasks. Our method marks the first successful attempt to enable a frozen LLM to generate image content while surpassing state-of-the-art performance in image understanding tasks, under the same setting, by over 25%.},
howpublished = {Advances in Neural Information Processing Systems (NeurIPS) (arXiv:2306.17842v2)},
keywords = {arXiv, computational video, computer vision, generative AI, NeurIPS},
pubstate = {published},
tppubtype = {inproceedings}
}
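The core conversion described in the SPAE abstract, pixels mapped to lexical tokens drawn from a frozen LLM's vocabulary, can be sketched in a few lines of numpy. Everything below (the random embedding table, the pooling per pyramid level) is a toy stand-in, not the paper's actual tokenizer.

# Minimal numpy sketch of the SPAE idea: quantize visual features into ids of the
# nearest (frozen) LLM word embeddings, at several pyramid levels.
import numpy as np

rng = np.random.default_rng(0)
vocab_size, dim = 1000, 64
llm_embeddings = rng.normal(size=(vocab_size, dim))     # stand-in for frozen LLM embeddings

def to_lexical_tokens(patch_features):
    """Map each visual feature vector to the id of its nearest LLM word embedding."""
    d2 = ((patch_features[:, None, :] - llm_embeddings[None, :, :]) ** 2).sum(-1)
    return d2.argmin(axis=1)

def semantic_pyramid(image_features, levels=(4, 16, 64)):
    """Coarse-to-fine token pyramid: few tokens for semantics, more for visual detail."""
    pyramid = []
    for n in levels:
        idx = rng.choice(len(image_features), size=n, replace=False)  # toy pooling step
        pyramid.append(to_lexical_tokens(image_features[idx]))
    return pyramid

feats = rng.normal(size=(256, dim))                      # e.g. 16x16 patch features
for level, toks in enumerate(semantic_pyramid(feats)):
    print(f"level {level}: {len(toks)} lexical token ids, first few: {toks[:5]}")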
Nikolai Warner, Meera Hahn, Jonathan Huang, Irfan Essa, Vighnesh Birodkar
Text and Click inputs for unambiguous open vocabulary instance segmentation Proceedings Article
In: Proceedings of the British Machine Vision Conference (BMVC), 2023.
Abstract | Links | BibTeX | Tags: arXiv, BMVC, computer vision, google, image segmentation
@inproceedings{2023-Warner-TACIFUOVIS,
title = {Text and Click inputs for unambiguous open vocabulary instance segmentation},
author = {Nikolai Warner and Meera Hahn and Jonathan Huang and Irfan Essa and Vighnesh Birodkar},
url = {https://doi.org/10.48550/arXiv.2311.14822
https://arxiv.org/abs/2311.14822
https://arxiv.org/pdf/2311.14822.pdf},
doi = {10.48550/arXiv.2311.14822},
year = {2023},
date = {2023-11-24},
urldate = {2023-11-24},
booktitle = {Proceedings of the British Machine Vision Conference (BMVC)},
abstract = {Segmentation localizes objects in an image on a fine-grained per-pixel scale. Segmentation benefits by humans-in-the-loop to provide additional input of objects to segment using a combination of foreground or background clicks. Tasks include photoediting or novel dataset annotation, where human annotators leverage an existing segmentation model instead of drawing raw pixel level annotations. We propose a new segmentation process, Text + Click segmentation, where a model takes as input an image, a text phrase describing a class to segment, and a single foreground click specifying the instance to segment. Compared to previous approaches, we leverage open-vocabulary image-text models to support a wide-range of text prompts. Conditioning segmentations on text prompts improves the accuracy of segmentations on novel or unseen classes. We demonstrate that the combination of a single user-specified foreground click and a text prompt allows a model to better disambiguate overlapping or co-occurring semantic categories, such as "tie", "suit", and "person". We study these results across common segmentation datasets such as refCOCO, COCO, VOC, and OpenImages. Source code available here.
},
keywords = {arXiv, BMVC, computer vision, google, image segmentation},
pubstate = {published},
tppubtype = {inproceedings}
}
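The Text + Click interface described in the abstract (image, free-form class phrase, and one foreground click in; instance mask out) is easy to illustrate. The sketch below is purely illustrative: the click is rendered as a heatmap channel and the "text embedding" and scoring are placeholders, not the open-vocabulary image-text model used in the paper.

# Hedged sketch of the Text + Click input interface (toy logic, hypothetical names).
import numpy as np

def encode_click(h, w, click_yx, sigma=10.0):
    """Render the user's single foreground click as a soft heatmap channel."""
    ys, xs = np.mgrid[0:h, 0:w]
    cy, cx = click_yx
    return np.exp(-((ys - cy) ** 2 + (xs - cx) ** 2) / (2 * sigma ** 2))

def segment(image, text_phrase, click_yx):
    """Toy pipeline: image + text + click -> per-pixel instance mask."""
    h, w, _ = image.shape
    click_map = encode_click(h, w, click_yx)
    text_bias = 0.01 * len(text_phrase)      # placeholder for a real text embedding score
    score = click_map + text_bias
    return score > 0.5                       # binary instance mask

image = np.zeros((128, 128, 3))
mask = segment(image, "tie", click_yx=(40, 64))
print("mask pixels:", int(mask.sum()))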
K. Niranjan Kumar, Irfan Essa, Sehoon Ha
Words into Action: Learning Diverse Humanoid Robot Behaviors using Language Guided Iterative Motion Refinement Proceedings Article
In: CoRL Workshop on Language and Robot Learning: Language as Grounding (at CoRL 2023), 2023.
Abstract | Links | BibTeX | Tags: arXiv, CoRL, robotics, vision & language
@inproceedings{2023-Kumar-WIALDHRBULGIM,
title = {Words into Action: Learning Diverse Humanoid Robot Behaviors using Language Guided Iterative Motion Refinement},
author = {K. Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://doi.org/10.48550/arXiv.2310.06226
https://arxiv.org/abs/2310.06226
https://arxiv.org/pdf/2310.06226.pdf
https://www.kniranjankumar.com/words_into_action/
},
doi = {10.48550/arXiv.2310.06226},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
booktitle = {CoRL Workshop on Language and Robot Learning: Language as Grounding (at CoRL 2023)},
abstract = {We present a method to simplify controller design by enabling users to train and fine-tune robot control policies using natural language commands. We first learn a neural network policy that generates behaviors given a natural language command, such as “walk forward”, by combining Large Language Models (LLMs), motion retargeting, and motion imitation. Based on the synthesized motion, we iteratively fine-tune by updating the text prompt and querying LLMs to find the best checkpoint associated with the closest motion in history.},
keywords = {arXiv, CoRL, robotics, vision & language},
pubstate = {published},
tppubtype = {inproceedings}
}
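The iterative loop in this abstract (synthesize a motion from a language command, imitate it, then query the LLM to refine the prompt and pick the best checkpoint) can be outlined as below. All helper callables are hypothetical placeholders injected by the caller, not the authors' implementation.

# Rough sketch of language-guided iterative motion refinement (assumptions throughout).
def language_guided_refinement(command, llm, synthesize_motion, train_policy,
                               motion_distance, n_rounds=3):
    prompt = command                       # e.g. "walk forward"
    history = []                           # (checkpoint, reference_motion) pairs
    for _ in range(n_rounds):
        reference = synthesize_motion(prompt)       # LLM + motion retargeting step
        checkpoint = train_policy(reference)        # motion-imitation RL step
        history.append((checkpoint, reference))
        # ask the LLM for a refined prompt given the latest rollout
        prompt = llm(f"Refine the instruction '{prompt}' so the motion better matches '{command}'.")
    # pick the checkpoint whose motion is closest to the latest reference (one possible criterion)
    best, _ = min(history, key=lambda cr: motion_distance(cr[1], history[-1][1]))
    return best

# toy usage with trivial stand-ins
best = language_guided_refinement(
    "walk forward",
    llm=lambda q: "walk forward briskly",
    synthesize_motion=lambda p: [len(p)],          # pretend a motion is a single scalar
    train_policy=lambda ref: f"ckpt_for_{ref[0]}",
    motion_distance=lambda a, b: abs(a[0] - b[0]))
print(best)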
K. Niranjan Kumar, Irfan Essa, Sehoon Ha
Cascaded Compositional Residual Learning for Complex Interactive Behaviors Journal Article
In: IEEE Robotics and Automation Letters, vol. 8, iss. 8, pp. 4601–4608, 2023.
Abstract | Links | BibTeX | Tags: IEEE, reinforcement learning, robotics
@article{2023-Kumar-CCRLCIB,
title = {Cascaded Compositional Residual Learning for Complex Interactive Behaviors},
author = {K. Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://ieeexplore.ieee.org/document/10152471},
doi = {10.1109/LRA.2023.3286171},
year = {2023},
date = {2023-06-14},
urldate = {2023-06-14},
journal = {IEEE Robotics and Automation Letters},
volume = {8},
issue = {8},
pages = {4601--4608},
abstract = {Real-world autonomous missions often require rich interaction with nearby objects, such as doors or switches, along with effective navigation. However, such complex behaviors are difficult to learn because they involve both high-level planning and low-level motor control. We present a novel framework, Cascaded Compositional Residual Learning (CCRL), which learns composite skills by recursively leveraging a library of previously learned control policies. Our framework combines multiple levels of pre-learned skills by using multiplicative skill composition and residual action learning. We also introduce a goal synthesis network and an observation selector to support combination of heterogeneous skills, each with its unique goals and observation space. Finally, we develop residual regularization for learning policies that solve a new task, while preserving the style of the motion enforced by the skill library. We show that our framework learns joint-level control policies for a diverse set of motor skills ranging from basic locomotion to complex interactive navigation, including navigating around obstacles, pushing objects, crawling under a table, pushing a door open with its leg, and holding it open while walking through it. The proposed CCRL framework leads to policies with consistent styles and lower joint torques, and successfully transfer to a real Unitree A1 robot without any additional fine-tuning.},
keywords = {IEEE, reinforcement learning, robotics},
pubstate = {published},
tppubtype = {article}
}
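The composition scheme described in the CCRL abstract, multiplicative combination of pre-learned skills plus a residual correction, can be sketched as a weighted product of Gaussian skill policies with a residual added to the composite mean. The gating and residual functions below are toy stand-ins under that assumption.

# Rough numpy sketch of multiplicative skill composition + residual action learning.
import numpy as np

def compose_skills(skill_means, skill_stds, weights):
    """Weighted product of Gaussian skill policies -> composite Gaussian action."""
    precisions = weights[:, None] / (skill_stds ** 2)
    var = 1.0 / precisions.sum(axis=0)
    mean = var * (precisions * skill_means).sum(axis=0)
    return mean, np.sqrt(var)

def act(obs, skills, gate, residual):
    means = np.stack([m(obs) for m, _ in skills])
    stds = np.stack([s(obs) for _, s in skills])
    mean, std = compose_skills(means, stds, gate(obs))
    return mean + residual(obs), std      # residual correction on top of the library

# toy usage: two 3-DoF "skills", a uniform gate, and a zero residual
skills = [(lambda o: np.ones(3), lambda o: np.ones(3)),
          (lambda o: -np.ones(3), lambda o: 2 * np.ones(3))]
action, std = act(np.zeros(4), skills,
                  gate=lambda o: np.array([0.5, 0.5]),
                  residual=lambda o: np.zeros(3))
print(action, std)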
Dina Bashkirova, José Lezama, Kihyuk Sohn, Kate Saenko, Irfan Essa
MaskSketch: Unpaired Structure-guided Masked Image Generation Proceedings Article
In: IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2023.
Abstract | Links | BibTeX | Tags: computer vision, CVPR, generative AI, generative media, google
@inproceedings{2023-Bashkirova-MUSMIG,
title = {MaskSketch: Unpaired Structure-guided Masked Image Generation},
author = {Dina Bashkirova and José Lezama and Kihyuk Sohn and Kate Saenko and Irfan Essa},
url = {https://arxiv.org/abs/2302.05496
https://openaccess.thecvf.com/content/CVPR2023/papers/Bashkirova_MaskSketch_Unpaired_Structure-Guided_Masked_Image_Generation_CVPR_2023_paper.pdf
https://openaccess.thecvf.com/content/CVPR2023/supplemental/Bashkirova_MaskSketch_Unpaired_Structure-Guided_CVPR_2023_supplemental.pdf},
doi = {10.48550/ARXIV.2302.05496},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
booktitle = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Recent conditional image generation methods produce images of remarkable diversity, fidelity and realism. However, the majority of these methods allow conditioning only on labels or text prompts, which limits their level of control over the generation result. In this paper, we introduce MaskSketch, an image generation method that allows spatial conditioning of the generation result using a guiding sketch as an extra conditioning signal during sampling. MaskSketch utilizes a pre-trained masked generative transformer, requiring no model training or paired supervision, and works with input sketches of different levels of abstraction. We show that intermediate self-attention maps of a masked generative transformer encode important structural information of the input image, such as scene layout and object shape, and we propose a novel sampling method based on this observation to enable structure-guided generation. Our results show that MaskSketch achieves high image realism and fidelity to the guiding structure. Evaluated on standard benchmark datasets, MaskSketch outperforms state-of-the-art methods for sketch-to-image translation, as well as unpaired image-to-image translation approaches.},
keywords = {computer vision, CVPR, generative AI, generative media, google},
pubstate = {published},
tppubtype = {inproceedings}
}
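The structure-guidance idea in the MaskSketch abstract, using a masked generative transformer's self-attention maps to keep samples faithful to the sketch's layout, can be illustrated with a simple re-ranking sketch. The model hooks (attention_maps, sample_candidates) are hypothetical, and re-ranking is only one simplified reading of the paper's sampling method.

# Hedged numpy sketch: re-score candidates by attention-map similarity to the sketch.
import numpy as np

def structure_score(attn_image, attn_sketch):
    """Cosine similarity of flattened self-attention maps (higher = closer structure)."""
    a, b = attn_image.ravel(), attn_sketch.ravel()
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

def structure_guided_sample(model, sketch, n_candidates=8):
    """Pick the candidate whose attention structure best matches the guiding sketch."""
    attn_sketch = model.attention_maps(sketch)
    candidates = model.sample_candidates(n_candidates)   # parallel masked decoding (assumed)
    scores = [structure_score(model.attention_maps(c), attn_sketch) for c in candidates]
    return candidates[int(np.argmax(scores))]

class _ToyModel:                              # stand-in model, just to run the example
    def attention_maps(self, x): return np.outer(x, x)
    def sample_candidates(self, n): return [np.random.rand(16) for _ in range(n)]

print(structure_guided_sample(_ToyModel(), sketch=np.random.rand(16)).shape)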
Lijun Yu, Yong Cheng, Kihyuk Sohn, José Lezama, Han Zhang, Huiwen Chang, Alexander G. Hauptmann, Ming-Hsuan Yang, Yuan Hao, Irfan Essa, Lu Jiang
MAGVIT: Masked Generative Video Transformer Proceedings Article
In: IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2023.
Abstract | Links | BibTeX | Tags: computational video, computer vision, CVPR, generative AI, generative media, google
@inproceedings{2023-Yu-MMGVT,
title = {MAGVIT: Masked Generative Video Transformer},
author = {Lijun Yu and Yong Cheng and Kihyuk Sohn and José Lezama and Han Zhang and Huiwen Chang and Alexander G. Hauptmann and Ming-Hsuan Yang and Yuan Hao and Irfan Essa and Lu Jiang},
url = {https://arxiv.org/abs/2212.05199
https://magvit.cs.cmu.edu/
https://openaccess.thecvf.com/content/CVPR2023/papers/Yu_MAGVIT_Masked_Generative_Video_Transformer_CVPR_2023_paper.pdf
https://openaccess.thecvf.com/content/CVPR2023/supplemental/Yu_MAGVIT_Masked_Generative_CVPR_2023_supplemental.pdf},
doi = {10.48550/ARXIV.2212.05199},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
booktitle = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {We introduce the MAsked Generative VIdeo Transformer, MAGVIT, to tackle various video synthesis tasks with a single model. We introduce a 3D tokenizer to quantize a video into spatial-temporal visual tokens and propose an embedding method for masked video token modeling to facilitate multi-task learning. We conduct extensive experiments to demonstrate the quality, efficiency, and flexibility of MAGVIT. Our experiments show that (i) MAGVIT performs favorably against state-of-the-art approaches and establishes the best-published FVD on three video generation benchmarks, including the challenging Kinetics-600. (ii) MAGVIT outperforms existing methods in inference time by two orders of magnitude against diffusion models and by 60x against autoregressive models. (iii) A single MAGVIT model supports ten diverse generation tasks and generalizes across videos from different visual domains. The source code and trained models will be released to the public.},
keywords = {computational video, computer vision, CVPR, generative AI, generative media, google},
pubstate = {published},
tppubtype = {inproceedings}
}
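The masked video token modeling objective mentioned in the abstract can be sketched as below: assume the 3D tokenizer (not shown) has already turned a clip into a grid of discrete ids, mask a random subset, and train a transformer to recover the masked ids. Sizes and the single-mask-token corruption scheme are illustrative assumptions.

# Minimal PyTorch sketch of masked video token modeling (toy sizes, assumed setup).
import torch
import torch.nn as nn

VOCAB, MASK_ID, T, H, W, DIM = 1024, 1024, 4, 8, 8, 256   # toy sizes

embed = nn.Embedding(VOCAB + 1, DIM)                       # +1 for the MASK token
backbone = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=DIM, nhead=8, batch_first=True), num_layers=2)
head = nn.Linear(DIM, VOCAB)

def masked_token_loss(video_tokens, mask_ratio=0.5):
    """video_tokens: (B, T*H*W) integer ids from the (frozen) 3D tokenizer."""
    mask = torch.rand_like(video_tokens, dtype=torch.float) < mask_ratio
    corrupted = video_tokens.masked_fill(mask, MASK_ID)
    logits = head(backbone(embed(corrupted)))
    return nn.functional.cross_entropy(logits[mask], video_tokens[mask])

tokens = torch.randint(0, VOCAB, (2, T * H * W))           # toy batch of two clips
print(masked_token_loss(tokens).item())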
Kihyuk Sohn, Yuan Hao, José Lezama, Luisa Polania, Huiwen Chang, Han Zhang, Irfan Essa, Lu Jiang
Visual Prompt Tuning for Generative Transfer Learning Proceedings Article
In: IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2023.
Abstract | Links | BibTeX | Tags: computer vision, CVPR, generative AI, generative media, google
@inproceedings{2022-Sohn-VPTGTL,
title = {Visual Prompt Tuning for Generative Transfer Learning},
author = {Kihyuk Sohn and Yuan Hao and José Lezama and Luisa Polania and Huiwen Chang and Han Zhang and Irfan Essa and Lu Jiang},
url = {https://arxiv.org/abs/2210.00990
https://openaccess.thecvf.com/content/CVPR2023/papers/Sohn_Visual_Prompt_Tuning_for_Generative_Transfer_Learning_CVPR_2023_paper.pdf
https://openaccess.thecvf.com/content/CVPR2023/supplemental/Sohn_Visual_Prompt_Tuning_CVPR_2023_supplemental.pdf},
doi = {10.48550/ARXIV.2210.00990},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
booktitle = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Transferring knowledge from an image synthesis model trained on a large dataset is a promising direction for learning generative image models from various domains efficiently. While previous works have studied GAN models, we present a recipe for learning vision transformers by generative knowledge transfer. We base our framework on state-of-the-art generative vision transformers that represent an image as a sequence of visual tokens to the autoregressive or non-autoregressive transformers. To adapt to a new domain, we employ prompt tuning, which prepends learnable tokens called prompt to the image token sequence, and introduce a new prompt design for our task. We study on a variety of visual domains, including visual task adaptation benchmark~\cite{zhai2019large}, with varying amount of training images, and show effectiveness of knowledge transfer and a significantly better image generation quality over existing works.},
keywords = {computer vision, CVPR, generative AI, generative media, google},
pubstate = {published},
tppubtype = {inproceedings}
}
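The prompt-tuning recipe in the abstract, prepending a small set of learnable tokens to the visual token sequence of a frozen generative transformer and training only those tokens, is captured by this short sketch. The transformer is a generic stand-in for the pretrained model.

# Hedged PyTorch sketch of visual prompt tuning (generic frozen backbone assumed).
import torch
import torch.nn as nn

DIM, N_PROMPT = 256, 16
frozen_transformer = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=DIM, nhead=8, batch_first=True), num_layers=2)
for p in frozen_transformer.parameters():
    p.requires_grad_(False)

prompt = nn.Parameter(torch.randn(1, N_PROMPT, DIM) * 0.02)   # the only trainable weights
opt = torch.optim.AdamW([prompt], lr=1e-3)

def forward_with_prompt(image_token_embeddings):
    """Prepend the learned prompt tokens to the visual token sequence."""
    b = image_token_embeddings.shape[0]
    x = torch.cat([prompt.expand(b, -1, -1), image_token_embeddings], dim=1)
    return frozen_transformer(x)[:, N_PROMPT:]     # drop prompt positions at the output

out = forward_with_prompt(torch.randn(2, 64, DIM))
print(out.shape)      # (2, 64, 256)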
Kihyuk Sohn, Albert Shaw, Yuan Hao, Han Zhang, Luisa Polania, Huiwen Chang, Lu Jiang, Irfan Essa
Learning Disentangled Prompts for Compositional Image Synthesis Technical Report
2023.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, generative AI, google, prompt engineering
@techreport{2023-Sohn-LDPCIS,
title = {Learning Disentangled Prompts for Compositional Image Synthesis},
author = {Kihyuk Sohn and Albert Shaw and Yuan Hao and Han Zhang and Luisa Polania and Huiwen Chang and Lu Jiang and Irfan Essa},
url = {https://arxiv.org/abs/2306.00763},
doi = {10.48550/arXiv.2306.00763},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
abstract = {We study domain-adaptive image synthesis, the problem of teaching pretrained image generative models a new style or concept from as few as one image to synthesize novel images, to better understand the compositional image synthesis. We present a framework that leverages a pre-trained class-conditional generation model and visual prompt tuning. Specifically, we propose a novel source class distilled visual prompt that learns disentangled prompts of semantic (e.g., class) and domain (e.g., style) from a few images. Learned domain prompt is then used to synthesize images of any classes in the style of target domain. We conduct studies on various target domains with the number of images ranging from one to a few to many, and show qualitative results which show the compositional generalization of our method. Moreover, we show that our method can help improve zero-shot domain adaptation classification accuracy.
},
howpublished = {arXiv:2306.00763 },
keywords = {arXiv, computer vision, generative AI, google, prompt engineering},
pubstate = {published},
tppubtype = {techreport}
}
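The composition described in this abstract, separate semantic (class) prompts and a domain (style) prompt that are combined at generation time, reduces to concatenating two small learnable token banks. The sizes and the downstream generator call below are illustrative assumptions.

# Minimal sketch of composing a class prompt with a learned domain (style) prompt.
import torch
import torch.nn as nn

DIM, N_CLS_TOK, N_DOM_TOK, N_CLASSES = 256, 4, 4, 10
class_prompts = nn.Parameter(torch.randn(N_CLASSES, N_CLS_TOK, DIM) * 0.02)
domain_prompt = nn.Parameter(torch.randn(1, N_DOM_TOK, DIM) * 0.02)   # learned from a few style images

def compose_prompt(class_id: int) -> torch.Tensor:
    """Prompt for 'class `class_id` in the target style': [class tokens | domain tokens]."""
    return torch.cat([class_prompts[class_id:class_id + 1], domain_prompt], dim=1)

prompt = compose_prompt(class_id=3)
print(prompt.shape)    # (1, 8, 256) -> would be fed to the frozen class-conditional generator (not shown)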
José Lezama, Tim Salimans, Lu Jiang, Huiwen Chang, Jonathan Ho, Irfan Essa
Discrete Predictor-Corrector Diffusion Models for Image Synthesis Proceedings Article
In: International Conference on Learning Representations (ICLR), 2023.
Abstract | Links | BibTeX | Tags: computer vision, generative AI, generative media, google, ICLR, machine learning
@inproceedings{2023-Lezama-DPDMIS,
title = {Discrete Predictor-Corrector Diffusion Models for Image Synthesis},
author = {José Lezama and Tim Salimans and Lu Jiang and Huiwen Chang and Jonathan Ho and Irfan Essa},
url = {https://openreview.net/forum?id=VM8batVBWvg},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
booktitle = {International Conference on Learning Representations (ICLR)},
abstract = {We introduce Discrete Predictor-Corrector diffusion models (DPC), extending predictor-corrector samplers in Gaussian diffusion models to the discrete case. Predictor-corrector samplers are a class of samplers for diffusion models, which improve on ancestral samplers by correcting the sampling distribution of intermediate diffusion states using MCMC methods. In DPC, the Langevin corrector, which does not have a direct counterpart in discrete space, is replaced with a discrete MCMC transition defined by a learned corrector kernel. The corrector kernel is trained to make the correction steps achieve asymptotic convergence, in distribution, to the correct marginal of the intermediate diffusion states. Equipped with DPC, we revisit recent transformer-based non-autoregressive generative models through the lens of discrete diffusion, and find that DPC can alleviate the compounding decoding error due to the parallel sampling of visual tokens. Our experiments show that DPC improves upon existing discrete latent space models for class-conditional image generation on ImageNet, and outperforms continuous diffusion models and GANs, according to standard metrics and user preference studies},
keywords = {computer vision, generative AI, generative media, google, ICLR, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
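The sampling loop described in the DPC abstract alternates an ancestral (predictor) step over discrete tokens with a learned corrector transition. The sketch below shows only that control flow; predictor and corrector are hypothetical callables returning categorical probabilities, not the trained kernels from the paper.

# Rough numpy sketch of a discrete predictor-corrector sampling loop.
import numpy as np

rng = np.random.default_rng(0)

def sample_categorical(probs):
    """Sample one id per row from (n_tokens, vocab) probabilities via inverse CDF."""
    cdf = probs.cumsum(axis=1)
    return (cdf < rng.random((len(probs), 1))).sum(axis=1)

def dpc_sample(tokens, predictor, corrector, n_steps=10, corrector_steps=1):
    for t in reversed(range(n_steps)):
        tokens = sample_categorical(predictor(tokens, t))        # predictor (ancestral) step
        for _ in range(corrector_steps):                         # learned MCMC correction
            tokens = sample_categorical(corrector(tokens, t))
    return tokens

# toy usage with uniform stand-in kernels over a vocabulary of 8 tokens
uniform = lambda toks, t: np.full((len(toks), 8), 1 / 8)
print(dpc_sample(np.zeros(16, dtype=int), uniform, uniform))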
Erik Wijmans, Manolis Savva, Irfan Essa, Stefan Lee, Ari S. Morcos, Dhruv Batra
Emergence of Maps in the Memories of Blind Navigation Agents (Best Paper Award) Proceedings Article
In: Proceedings of International Conference on Learning Representations (ICLR), 2023.
Abstract | Links | BibTeX | Tags: awards, best paper award, computer vision, google, ICLR, machine learning, robotics
@inproceedings{2023-Wijmans-EMMBNA,
title = {Emergence of Maps in the Memories of Blind Navigation Agents},
author = {Erik Wijmans and Manolis Savva and Irfan Essa and Stefan Lee and Ari S. Morcos and Dhruv Batra},
url = {https://arxiv.org/abs/2301.13261
https://wijmans.xyz/publication/eom/
https://openreview.net/forum?id=lTt4KjHSsyl
https://blog.iclr.cc/2023/03/21/announcing-the-iclr-2023-outstanding-paper-award-recipients/},
doi = {10.48550/ARXIV.2301.13261},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
booktitle = {Proceedings of International Conference on Learning Representations (ICLR)},
abstract = {Animal navigation research posits that organisms build and maintain internal spatial representations, or maps, of their environment. We ask if machines -- specifically, artificial intelligence (AI) navigation agents -- also build implicit (or 'mental') maps. A positive answer to this question would (a) explain the surprising phenomenon in recent literature of ostensibly map-free neural-networks achieving strong performance, and (b) strengthen the evidence of mapping as a fundamental mechanism for navigation by intelligent embodied agents, whether they be biological or artificial. Unlike animal navigation, we can judiciously design the agent's perceptual system and control the learning paradigm to nullify alternative navigation mechanisms. Specifically, we train 'blind' agents -- with sensing limited to only egomotion and no other sensing of any kind -- to perform PointGoal navigation ('go to Δ x, Δ y') via reinforcement learning. Our agents are composed of navigation-agnostic components (fully-connected and recurrent neural networks), and our experimental setup provides no inductive bias towards mapping. Despite these harsh conditions, we find that blind agents are (1) surprisingly effective navigators in new environments (~95% success); (2) they utilize memory over long horizons (remembering ~1,000 steps of past experience in an episode); (3) this memory enables them to exhibit intelligent behavior (following walls, detecting collisions, taking shortcuts); (4) there is emergence of maps and collision detection neurons in the representations of the environment built by a blind agent as it navigates; and (5) the emergent maps are selective and task dependent (e.g. the agent 'forgets' exploratory detours). Overall, this paper presents no new techniques for the AI audience, but a surprising finding, an insight, and an explanation.},
keywords = {awards, best paper award, computer vision, google, ICLR, machine learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
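The experimental setup in the abstract, a "blind" agent whose only inputs are egomotion and the PointGoal vector, wired through fully-connected and recurrent layers, is simple to sketch. The wiring and sizes below are illustrative only.

# Small PyTorch sketch of a blind, memory-equipped navigation policy (assumed sizes).
import torch
import torch.nn as nn

class BlindAgent(nn.Module):
    def __init__(self, hidden=128, n_actions=4):
        super().__init__()
        # observation = [last egomotion Δx, Δy, goal Δx, goal Δy]; no vision of any kind
        self.encoder = nn.Linear(4, hidden)
        self.memory = nn.GRU(hidden, hidden, batch_first=True)
        self.policy = nn.Linear(hidden, n_actions)
    def forward(self, obs_seq, h=None):
        x = torch.relu(self.encoder(obs_seq))
        x, h = self.memory(x, h)
        return self.policy(x), h        # action logits per step + recurrent memory

agent = BlindAgent()
logits, memory = agent(torch.randn(1, 1000, 4))   # ~1,000-step episode, as in the abstract
print(logits.shape)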
Yi-Hao Peng, Peggy Chi, Anjuli Kannan, Meredith Morris, Irfan Essa
Slide Gestalt: Automatic Structure Extraction in Slide Decks for Non-Visual Access Proceedings Article
In: ACM CHI Conference on Human Factors in Computing Systems (CHI), 2023.
Abstract | Links | BibTeX | Tags: accessibility, CHI, google, human-computer interaction
@inproceedings{2023-Peng-SGASESDNA,
title = {Slide Gestalt: Automatic Structure Extraction in Slide Decks for Non-Visual Access},
author = {Yi-Hao Peng and Peggy Chi and Anjuli Kannan and Meredith Morris and Irfan Essa},
url = {https://research.google/pubs/pub52182/
https://dl.acm.org/doi/fullHtml/10.1145/3544548.3580921
https://doi.org/10.1145/3544548.3580921
https://www.youtube.com/watch?v=pK08aMRx4qo},
year = {2023},
date = {2023-04-23},
urldate = {2023-04-23},
booktitle = {ACM CHI Conference on Human Factors in Computing Systems (CHI)},
abstract = {Presentation slides commonly use visual patterns for structural navigation, such as titles, dividers, and build slides. However, screen readers do not capture such intention, making it time-consuming and less accessible for blind and visually impaired (BVI) users to linearly consume slides with repeated content. We present Slide Gestalt, an automatic approach that identifies the hierarchical structure in a slide deck. Slide Gestalt computes the visual and textual correspondences between slides to generate hierarchical groupings. Readers can navigate the slide deck from the higher-level section overview to the lower-level description of a slide group or individual elements interactively with our UI. We derived slide consumption and authoring practices from interviews with BVI readers and sighted creators and an analysis of 100 decks. We performed our pipeline with 50 real-world slide decks and a large dataset. Feedback from eight BVI participants showed that Slide Gestalt helped navigate a slide deck by anchoring content more efficiently, compared to using accessible slides.},
keywords = {accessibility, CHI, google, human-computer interaction},
pubstate = {published},
tppubtype = {inproceedings}
}
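The grouping step described in the abstract, computing correspondences between slides and turning them into hierarchical sections, can be illustrated with a toy similarity measure. The Jaccard overlap of slide text below is a stand-in, not Slide Gestalt's actual visual/textual correspondence model.

# Hedged sketch: group consecutive slides into sections when their similarity is high.
def slide_similarity(a: str, b: str) -> float:
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / max(1, len(ta | tb))       # toy Jaccard overlap of slide text

def group_slides(slide_texts, threshold=0.3):
    """Return groupings as lists of consecutive slide indices (one list per section)."""
    groups, current = [], [0]
    for i in range(1, len(slide_texts)):
        if slide_similarity(slide_texts[i - 1], slide_texts[i]) >= threshold:
            current.append(i)                        # same section (e.g. build slides)
        else:
            groups.append(current); current = [i]    # similarity drop -> new section
    groups.append(current)
    return groups

deck = ["Intro to Gestalt", "Intro to Gestalt details", "Results overview", "Results charts"]
print(group_slides(deck))    # -> [[0, 1], [2, 3]]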
Karan Samel, Jun Ma, Zhengyang Wang, Tong Zhao, Irfan Essa
Knowledge Relevance BERT: Integrating Noisy Knowledge into Language Representation. Proceedings Article
In: AAAI Workshop on Knowledge Augmented Methods for NLP (KnowledgeNLP-AAAI 2023), 2023.
Abstract | Links | BibTeX | Tags: AI, knowledge representation, NLP
@inproceedings{2023-Samel-KRBINKILR,
title = {Knowledge Relevance BERT: Integrating Noisy Knowledge into Language Representation.},
author = {Karan Samel and Jun Ma and Zhengyang Wang and Tong Zhao and Irfan Essa},
url = {https://knowledge-nlp.github.io/aaai2023/papers/005-KRBERT-oral.pdf},
year = {2023},
date = {2023-02-01},
urldate = {2023-02-01},
booktitle = {AAAI Workshop on Knowledge Augmented Methods for NLP (KnowledgeNLP-AAAI 2023)},
abstract = {Integrating structured knowledge into language model representations increases recall of domain-specific information useful for downstream tasks. Matching between knowledge graph entities and text entity mentions can be easily performed when entity names are unique or entity-linking data exists. When extending this setting to new domains, newly mined knowledge contains ambiguous and incorrect information without explicit linking information. In such settings, we design a framework to robustly link relevant knowledge to input texts as an intermediate modeling step while performing end-to-end domain fine-tuning tasks. This is done by first computing the similarity of the existing task labels with candidate knowledge triplets to generate relevance labels. We use these labels to train a relevance model, which predicts the relevance of the inserted triplets to the original text. This relevance model is integrated within a language model, leading to our Knowledge Relevance BERT (KR-BERT) framework. We test KR-BERT for linking and ranking tasks on a real-world e-commerce dataset and a public entity linking task, where we show performance improvements over strong baselines.},
keywords = {AI, knowledge representation, NLP},
pubstate = {published},
tppubtype = {inproceedings}
}
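The weak-labeling step described in the abstract, scoring candidate knowledge triplets against an example's existing task label to produce relevance labels for training the relevance model, can be sketched as below. The token-overlap similarity and the example data are toy assumptions; the paper uses learned representations.

# Small sketch of generating relevance labels for candidate triplets (toy similarity).
def overlap(a: str, b: str) -> float:
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / max(1, len(ta | tb))

def relevance_labels(task_label: str, triplets, threshold=0.2):
    """Return (triplet, soft score, binary relevance) for each candidate triplet."""
    labeled = []
    for head, relation, tail in triplets:
        score = overlap(task_label, f"{head} {relation} {tail}")
        labeled.append(((head, relation, tail), score, score >= threshold))
    return labeled

triplets = [("running shoe", "has_feature", "breathable mesh"),
            ("running shoe", "brand", "acme")]
for row in relevance_labels("breathable running shoe", triplets):
    print(row)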
Tianhao Zhang, Weilong Yang, Honglak Lee, Hung-Yu Tseng, Irfan Essa, Lu Jiang
Image manipulation by text instruction Patent
2023.
Abstract | Links | BibTeX | Tags: content creation, generative AI, google, media generation, patents
@patent{2023-Zhang-IMTI,
title = {Image manipulation by text instruction},
author = {Tianhao Zhang and Weilong Yang and Honglak Lee and Hung-Yu Tseng and Irfan Essa and Lu Jiang},
url = {https://patents.google.com/patent/US11562518},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
abstract = {A method for generating an output image from an input image and an input text instruction that specifies a location and a modification of an edit applied to the input image using a neural network is described. The neural network includes an image encoder, an image decoder, and an instruction attention network. The method includes receiving the input image and the input text instruction; extracting, from the input image, an input image feature that represents features of the input image using the image encoder; generating a spatial feature and a modification feature from the input text instruction using the instruction attention network; generating an edited image feature from the input image feature, the spatial feature and the modification feature; and generating the output image from the edited image feature using the image decoder.},
howpublished = {US Patent # US11562518},
keywords = {content creation, generative AI, google, media generation, patents},
pubstate = {published},
tppubtype = {patent}
}
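The pipeline recited in the patent abstract (image encoder, instruction attention network producing a spatial feature and a modification feature, combination into an edited image feature, then an image decoder) can be outlined with generic modules. Everything below is an illustrative stand-in, not the claimed network.

# Hedged PyTorch sketch of the encoder / instruction-attention / decoder pipeline.
import torch
import torch.nn as nn

class TextToEditModel(nn.Module):
    def __init__(self, dim=64):
        super().__init__()
        self.image_encoder = nn.Conv2d(3, dim, kernel_size=3, padding=1)
        self.instruction_attention = nn.Linear(dim, 2 * dim)   # -> spatial + modification features
        self.image_decoder = nn.Conv2d(dim, 3, kernel_size=3, padding=1)
    def forward(self, image, instruction_embedding):
        feat = self.image_encoder(image)                        # input image feature (B, D, H, W)
        spatial, modification = self.instruction_attention(instruction_embedding).chunk(2, dim=-1)
        gate = torch.sigmoid(spatial)[:, :, None, None]         # "where" to edit
        edit = modification[:, :, None, None]                   # "what" to change
        edited = feat * (1 - gate) + (feat + edit) * gate       # edited image feature
        return self.image_decoder(edited)                       # output image

model = TextToEditModel()
out = model(torch.randn(1, 3, 32, 32), torch.randn(1, 64))
print(out.shape)    # (1, 3, 32, 32)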