@inproceedings{2023-Warner-TACIFUOVIS,
title = {Text and Click inputs for unambiguous open vocabulary instance segmentation},
author = {Nikolai Warner and Meera Hahn and Jonathan Huang and Irfan Essa and Vighnesh Birodkar},
url = {https://doi.org/10.48550/arXiv.2311.14822
https://arxiv.org/abs/2311.14822
https://arxiv.org/pdf/2311.14822.pdf},
doi = {10.48550/arXiv.2311.14822},
year = {2023},
date = {2023-11-24},
urldate = {2023-11-24},
booktitle = {Proceedings of the British Machine Vision Conference (BMVC)},
abstract = {Segmentation localizes objects in an image on a fine-grained, per-pixel scale. Segmentation benefits from humans in the loop who provide additional input indicating the objects to segment, using a combination of foreground and background clicks. Applications include photo editing and novel dataset annotation, where human annotators leverage an existing segmentation model instead of drawing raw pixel-level annotations. We propose a new segmentation process, Text + Click segmentation, where a model takes as input an image, a text phrase describing a class to segment, and a single foreground click specifying the instance to segment. Compared to previous approaches, we leverage open-vocabulary image-text models to support a wide range of text prompts. Conditioning segmentations on text prompts improves the accuracy of segmentations on novel or unseen classes. We demonstrate that the combination of a single user-specified foreground click and a text prompt allows a model to better disambiguate overlapping or co-occurring semantic categories, such as "tie", "suit", and "person". We study these results across common segmentation datasets such as refCOCO, COCO, VOC, and OpenImages. Source code available here.},
keywords = {arXiv, BMVC, computer vision, google, image segmentation},
pubstate = {published},
tppubtype = {inproceedings}
}
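The abstract describes a simple interface: the model consumes an image, an open-vocabulary text phrase, and one foreground click, and returns an instance mask. The sketch below illustrates that input/output contract only; the `TextClickQuery` dataclass, the `segment` wrapper, and the `model` callable are hypothetical placeholders, not the authors' released code or API.

```python
# Minimal sketch of the Text + Click interface described in the abstract.
# All names here (TextClickQuery, segment, model) are illustrative assumptions.
from dataclasses import dataclass
from typing import Callable, Tuple
import numpy as np

@dataclass
class TextClickQuery:
    image: np.ndarray        # H x W x 3 RGB image
    text: str                # open-vocabulary class phrase, e.g. "tie"
    click: Tuple[int, int]   # (row, col) of a single foreground click

def segment(
    query: TextClickQuery,
    model: Callable[[np.ndarray, str, Tuple[int, int]], np.ndarray],
) -> np.ndarray:
    """Return a binary H x W mask for the clicked instance of the
    class named by the text prompt (hypothetical wrapper)."""
    return model(query.image, query.text, query.click)

# Usage idea: the same pixel can belong to overlapping categories
# ("tie", "suit", "person"); the text prompt selects which one the
# click refers to, which is the disambiguation the paper studies.
```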