@article{M5DB1FA18,
  title    = "SyllaBERT: A Syllable-Based Efficient Robust Transformer Model for Real-World Noise and Typographical Errors",
  journal  = "Journal of KIISE, JOK",
  year     = "2025",
  issn     = "2383-630X",
  doi      = "10.5626/JOK.2025.52.3.250",
  author   = "Seongwan Park and Yumin Heo and Youngjoong Ko",
  keywords = "Korean language model, syllable, tokenizer, misspelling, natural language processing",
  abstract = "Training a Korean language model requires a tokenizer designed for the unique features of the Korean language, making tokenizer design a crucial step in the modeling process. Most current language models use morpheme-based or subword-based tokenization. While these approaches work well on clean Korean text, they are prone to out-of-vocabulary (OOV) issues caused by abbreviations and neologisms frequently encountered in real-world Korean data. Moreover, real Korean text often contains typos and non-standard expressions, against which traditional morpheme-based and subword-based tokenizers are not sufficiently robust. To tackle these challenges, this paper introduces SyllaBERT, a model that employs syllable-level tokenization to handle the specific characteristics of Korean, even in noisy and non-standard contexts, with minimal resources. A compact syllable-level vocabulary was constructed, and a syllable-based language model was built by reducing the embedding and hidden layer sizes of existing models. Experimental results show that, despite having approximately four times fewer parameters than subword-based models, SyllaBERT outperforms them on natural language understanding tasks over real-world conversational Korean data that includes noise."
}