A searchable list of some of my publications is below. You can also access my publications from the sites listed at the end of this page.
My ORCID is
Publications:
Seung Hyun Lee, Yinxiao Li, Junjie Ke, Innfarn Yoo, Han Zhang, Jiahui Yu, Qifei Wang, Fei Deng, Glenn Entis, Junfeng He, Gang Li, Sangpil Kim, Irfan Essa, Feng Yang
Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation Proceedings Article
In: Proceedings of European Conference on Computer Vision (ECCV), 2024.
Tags: arXiv, computer vision, ECCV, generative AI, google, reinforcement learning
@inproceedings{2024-Lee-PPMRLFTG,
title = {Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation},
author = {Seung Hyun Lee and Yinxiao Li and Junjie Ke and Innfarn Yoo and Han Zhang and Jiahui Yu and Qifei Wang and Fei Deng and Glenn Entis and Junfeng He and Gang Li and Sangpil Kim and Irfan Essa and Feng Yang},
url = {https://arxiv.org/abs/2401.05675
https://arxiv.org/pdf/2401.05675
https://dl.acm.org/doi/10.1007/978-3-031-72920-1_26},
doi = {10.48550/arXiv.2401.05675},
year = {2024},
date = {2024-07-25},
urldate = {2024-07-25},
booktitle = {Proceedings of European Conference on Computer Vision (ECCV)},
abstract = {Recent works have demonstrated that using reinforcement learning (RL) with multiple quality rewards can improve the quality of generated images in text-to-image (T2I) generation. However, manually adjusting reward weights poses challenges and may cause over-optimization of certain metrics. To solve this, we propose Parrot, which addresses the issue through multi-objective optimization and introduces an effective multi-reward optimization strategy to approximate the Pareto-optimal set. Utilizing batch-wise Pareto-optimal selection, Parrot automatically identifies the optimal trade-off among different rewards. We use the novel multi-reward optimization algorithm to jointly optimize the T2I model and a prompt expansion network, resulting in significant improvement of image quality and allowing control over the trade-off among different rewards using a reward-related prompt during inference. Furthermore, we introduce original-prompt-centered guidance at inference time, ensuring fidelity to user input after prompt expansion. Extensive experiments and a user study validate the superiority of Parrot over several baselines across various quality criteria, including aesthetics, human preference, text-image alignment, and image sentiment.},
keywords = {arXiv, computer vision, ECCV, generative AI, google, reinforcement learning},
pubstate = {published},
tppubtype = {inproceedings}
}
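The batch-wise Pareto-optimal selection described in the abstract has a simple core: within a batch of images scored on several rewards, keep only the samples that no other sample dominates. A minimal sketch of that selection step in Python; the function and variable names are mine, not the paper's code:

import numpy as np

def pareto_front(rewards):
    # rewards: (batch, num_rewards) array; higher is better on every axis.
    n = rewards.shape[0]
    keep = np.ones(n, dtype=bool)
    for i in range(n):
        for j in range(n):
            # Sample j dominates i if it is >= on all rewards and > on at least one.
            if i != j and np.all(rewards[j] >= rewards[i]) and np.any(rewards[j] > rewards[i]):
                keep[i] = False
                break
    return np.nonzero(keep)[0]

# Example: non-dominated samples in a batch of 16 images scored on
# aesthetics, human preference, text-image alignment, and image sentiment.
print(pareto_front(np.random.rand(16, 4)))

The quadratic scan is acceptable at RL fine-tuning batch sizes; under this reading, the returned indices are the samples whose rewards drive the policy update.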
José Lezama, Huiwen Chang, Lu Jiang, Irfan Essa
Improved Masked Image Generation with Token-Critic Proceedings Article
In: European Conference on Computer Vision (ECCV), Springer, 2022, ISBN: 978-3-031-20050-2.
Tags: computer vision, ECCV, generative AI, generative media, google
@inproceedings{2022-Lezama-IMIGWT,
title = {Improved Masked Image Generation with Token-Critic},
author = {José Lezama and Huiwen Chang and Lu Jiang and Irfan Essa},
url = {https://arxiv.org/abs/2209.04439
https://rdcu.be/c61MZ},
doi = {10.1007/978-3-031-20050-2_5},
isbn = {978-3-031-20050-2},
year = {2022},
date = {2022-10-28},
urldate = {2022-10-28},
booktitle = {European Conference on Computer Vision (ECCV)},
volume = {13683},
publisher = {Springer},
abstract = {Non-autoregressive generative transformers have recently demonstrated impressive image generation performance, with orders of magnitude faster sampling than their autoregressive counterparts. However, optimal parallel sampling from the true joint distribution of visual tokens remains an open challenge. In this paper we introduce Token-Critic, an auxiliary model to guide the sampling of a non-autoregressive generative transformer. Given a masked-and-reconstructed real image, the Token-Critic model is trained to distinguish which visual tokens belong to the original image and which were sampled by the generative transformer. During non-autoregressive iterative sampling, Token-Critic is used to select which tokens to accept and which to reject and resample. Coupled with Token-Critic, a state-of-the-art generative transformer significantly improves its performance, and outperforms recent diffusion models and GANs in terms of the trade-off between generated image quality and diversity on challenging class-conditional ImageNet generation.},
keywords = {computer vision, ECCV, generative AI, generative media, google},
pubstate = {published},
tppubtype = {inproceedings}
}
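The sampling loop in the abstract alternates two roles: the generator fills every masked position in parallel, and the critic decides which filled-in tokens survive to the next step. A minimal sketch, assuming hypothetical generator and critic callables and output shapes; this is not the authors' implementation:

import torch

def sample_with_critic(generator, critic, seq_len, mask_id, steps=12):
    # Start from a fully masked sequence of visual tokens.
    tokens = torch.full((1, seq_len), mask_id, dtype=torch.long)
    for t in range(steps):
        # The generator proposes tokens for all masked positions in parallel.
        logits = generator(tokens)  # assumed shape (1, seq_len, vocab_size)
        proposal = torch.distributions.Categorical(logits=logits).sample()
        tokens = torch.where(tokens == mask_id, proposal, tokens)
        if t == steps - 1:
            break
        # The critic scores how plausible each token is in context; the
        # lowest-scoring ones are rejected, re-masked, and resampled.
        scores = critic(tokens)  # assumed shape (1, seq_len), higher = keep
        num_reject = int(seq_len * (1 - (t + 1) / steps))
        reject = scores.topk(num_reject, largest=False).indices
        tokens[0, reject[0]] = mask_id
    return tokens

The distinguishing choice is that accept/reject decisions come from a separately trained model rather than from the generator's own token confidences.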
Xiang Kong, Lu Jiang, Huiwen Chang, Han Zhang, Yuan Hao, Haifeng Gong, Irfan Essa
BLT: Bidirectional Layout Transformer for Controllable Layout Generation Proceedings Article
In: European Conference on Computer Vision (ECCV), 2022, ISBN: 978-3-031-19789-5.
Tags: computer vision, ECCV, generative AI, generative media, google, vision transformer
@inproceedings{2022-Kong-BLTCLG,
title = {BLT: Bidirectional Layout Transformer for Controllable Layout Generation},
author = {Xiang Kong and Lu Jiang and Huiwen Chang and Han Zhang and Yuan Hao and Haifeng Gong and Irfan Essa},
url = {https://arxiv.org/abs/2112.05112
https://rdcu.be/c61AE},
doi = {10.1007/978-3-031-19790-1_29},
isbn = {978-3-031-19789-5},
year = {2022},
date = {2022-10-25},
urldate = {2022-10-25},
booktitle = {European Conference on Computer Vision (ECCV)},
volume = {13677},
abstract = {Creating visual layouts is a critical step in graphic design. Automatic generation of such layouts is essential for scalable and diverse visual designs. To advance conditional layout generation, we introduce BLT, a bidirectional layout transformer. BLT differs from previous work on transformers in adopting non-autoregressive transformers. In training, BLT learns to predict the masked attributes by attending to surrounding attributes in two directions. During inference, BLT first generates a draft layout from the input and then iteratively refines it into a high-quality layout by masking out low-confidence attributes. The masks generated in both training and inference are controlled by a new hierarchical sampling policy. We verify the proposed model on six benchmarks of diverse design tasks. Experimental results demonstrate two benefits compared to state-of-the-art layout transformer models. First, our model empowers layout transformers to fulfill controllable layout generation. Second, it achieves up to a 10x speedup over the layout transformer baseline when generating a layout at inference time. Code is released at https://shawnkx.github.io/blt.},
keywords = {computer vision, ECCV, generative AI, generative media, google, vision transformer},
pubstate = {published},
tppubtype = {inproceedings}
}
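The draft-then-refine procedure in the abstract follows a similar iterative pattern, but uses the model's own per-slot confidence to decide what to re-mask. A rough sketch with a hypothetical bidirectional model over flattened layout-attribute tokens; the code released at https://shawnkx.github.io/blt is the authoritative version:

import torch

def refine_layout(model, attrs, mask_id, rounds=4):
    # attrs: (1, n) tensor of layout attribute tokens (e.g., class, x, y,
    # width, height per element); slots to be generated hold mask_id.
    for r in range(rounds):
        probs = model(attrs).softmax(-1)   # assumed shape (1, n, vocab_size)
        conf, pred = probs.max(-1)         # per-slot confidence and argmax token
        attrs = torch.where(attrs == mask_id, pred, attrs)  # draft layout
        if r == rounds - 1:
            break
        # Re-mask the least confident slots, shrinking the set each round.
        # The paper's hierarchical sampling policy additionally groups slots
        # by attribute type and keeps user-specified attributes fixed.
        k = int(attrs.size(1) * 0.5 * (1 - (r + 1) / rounds))
        low = conf.topk(k, largest=False).indices
        attrs[0, low[0]] = mask_id
    return attrs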
Hsin-Ying Lee, Lu Jiang, Irfan Essa, Madison Le, Haifeng Gong, Ming-Hsuan Yang, Weilong Yang
Neural Design Network: Graphic Layout Generation with Constraints Proceedings Article
In: Proceedings of European Conference on Computer Vision (ECCV), 2020.
Tags: computer vision, content creation, ECCV, generative media, google
@inproceedings{2020-Lee-NDNGLGWC,
title = {Neural Design Network: Graphic Layout Generation with Constraints},
author = {Hsin-Ying Lee and Lu Jiang and Irfan Essa and Madison Le and Haifeng Gong and Ming-Hsuan Yang and Weilong Yang},
url = {https://arxiv.org/abs/1912.09421
https://rdcu.be/c7sqw},
doi = {10.1007/978-3-030-58580-8_29},
year = {2020},
date = {2020-08-01},
urldate = {2020-08-01},
booktitle = {Proceedings of European Conference on Computer Vision (ECCV)},
keywords = {computer vision, content creation, ECCV, generative media, google},
pubstate = {published},
tppubtype = {inproceedings}
}
Glenn Hartmann, Matthias Grundmann, Judy Hoffman, David Tsai, Vivek Kwatra, Omid Madani, Sudheendra Vijayanarasimhan, Irfan Essa, James Rehg, Rahul Sukthankar
Weakly Supervised Learning of Object Segmentations from Web-Scale Videos (Best Paper) Proceedings Article
In: Proceedings of ECCV 2012 Workshop on Web-scale Vision and Social Media, 2012.
Tags: awards, best paper award, computer vision, ECCV, machine learning
@inproceedings{2012-Hartmann-WSLOSFWV,
title = {Weakly Supervised Learning of Object Segmentations from Web-Scale Videos},
author = {Glenn Hartmann and Matthias Grundmann and Judy Hoffman and David Tsai and Vivek Kwatra and Omid Madani and Sudheendra Vijayanarasimhan and Irfan Essa and James Rehg and Rahul Sukthankar},
url = {https://link.springer.com/chapter/10.1007/978-3-642-33863-2_20
https://research.google.com/pubs/archive/40735.pdf},
doi = {10.1007/978-3-642-33863-2_20},
year = {2012},
date = {2012-10-01},
urldate = {2012-10-01},
booktitle = {Proceedings of ECCV 2012 Workshop on Web-scale Vision and Social Media},
abstract = {We propose to learn pixel-level segmentations of objects from weakly labeled (tagged) internet videos. Specifically, given a large collection of raw YouTube content, along with potentially noisy tags, our goal is to automatically generate spatio-temporal masks for each object, such as “dog”, without employing any pre-trained object detectors. We formulate this problem as learning weakly supervised classifiers for a set of independent spatio-temporal segments. The object seeds obtained using segment-level classifiers are further refined using graphcuts to generate high-precision object masks. Our results, obtained by training on a dataset of 20,000 YouTube videos weakly tagged into 15 classes, demonstrate automatic extraction of pixel-level object masks. Evaluated against a ground-truthed subset of 50,000 frames with pixel-level annotations, we confirm that our proposed methods can learn good object masks just by watching YouTube.},
keywords = {awards, best paper award, computer vision, ECCV, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
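The weak-supervision step in the abstract reduces to propagating each video's tags to all of its spatio-temporal segments and training an ordinary classifier on the result. A minimal sketch assuming precomputed segment features; the names are hypothetical, and the graphcut refinement that turns scores into masks is omitted:

import numpy as np
from sklearn.linear_model import LogisticRegression

def train_segment_classifier(segment_feats, segment_video, video_tags, target_tag):
    # segment_feats: (n_segments, d) features of spatio-temporal segments.
    # segment_video: video index for each segment; video_tags: per-video tag sets.
    # A segment is weakly positive iff its source video carries the target tag.
    y = np.array([target_tag in video_tags[v] for v in segment_video], dtype=int)
    return LogisticRegression(max_iter=1000).fit(segment_feats, y)

The per-segment scores from such a classifier play the role of the object seeds that the paper then refines with graphcuts into high-precision masks.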
Other Publication Sites
A few more sites that aggregate research publications: Academia.edu, Bibsonomy, CiteULike, Mendeley.
Copyright/About
[Please see the Copyright Statement that may apply to the content listed here.]
This list of publications is produced by using the teachPress plugin for WordPress.