% Journal article (Journal of KIISE, 2026) on zero-shot referring image
% segmentation. Authors are separated with " and " and written as
% "Last, First" so BibTeX/Biber parse each name unambiguously.
% Percent signs in the abstract are escaped so LaTeX does not treat the
% rest of the line as a comment when a style prints the abstract.
% NOTE(review): volume/number/pages are not recorded; the DOI suffix
% (53.1.15) suggests volume 53, issue 1, first page 15 -- confirm against
% the publisher record before adding those fields.
% NOTE(review): the journal value carries a trailing ", JOK" abbreviation
% from the export; confirm whether it should be just "Journal of KIISE".
@article{M0F4774A5,
  author   = {Song, Seungheon and Kim, Sungsik and Seo, Junghyeon and Lee, Jaekoo},
  title    = {Zero-shot Referring Image Segmentation using Referring Expression Augmentation and Mask Aggregation},
  journal  = {Journal of KIISE, JOK},
  year     = {2026},
  issn     = {2383-630X},
  doi      = {10.5626/JOK.2026.53.1.15},
  keywords = {computer vision, image segmentation, reference image segmentation, large language model},
  abstract = {With advancements in computer vision technology, image segmentation tasks are increasingly utilized across various fields. Among these, reference image segmentation is particularly valuable for achieving precise regional segmentation based on user instructions. In this paper, we introduce a reference image segmentation framework inspired by human cognitive processes, which utilizes prior knowledge to recognize objects. Our method employs a large language model to infer various visual attributes of objects and integrates these inferences with mask proposals. Consequently, our approach enhances oIoU and mIoU performance by 0.41\% and 0.74\%, respectively, compared to existing methods on the RefCOCO, RefCOCO+, and RefCOCOg benchmark datasets.},
}