@article{M9C53ACE3,
  title    = "Enhancing Molecular Understanding in LLMs through Multimodal Graph-SMILES Representations",
  journal  = "Journal of KIISE, JOK",
  year     = "2025",
  issn     = "2383-630X",
  doi      = "10.5626/JOK.2025.52.5.379",
  author   = "Wooseong Cho and Minjun Kang and Jaekoo Lee and Eunseong Choi and Jongwuk Lee",
  keywords = "Molecule, Graph, Multimodal, Large Language Models (LLMs)",
  abstract = "Recent advancements in large language models (LLMs) have shown remarkable performance across various tasks, with increasing focus on multimodal research. Notably, BLIP-2 enhances performance by efficiently aligning images and text with a Q-Former, aided by an image encoder pre-trained on multimodal data. Inspired by this, the MolCA model extends BLIP-2 to the molecular domain. However, the graph encoder in MolCA is pre-trained on unimodal data and must therefore be updated during model training, which is a limitation. This paper replaces it with a graph encoder pre-trained on multimodal data that is kept frozen during model training. Experimental results showed that the graph encoder pre-trained on multimodal data generally enhanced performance. Moreover, unlike the unimodally pre-trained graph encoder, which performed better when updated, the multimodally pre-trained graph encoder achieved superior results across all metrics when frozen."
}