% Journal article introducing a Korean retrieval-augmented generation (RAG)
% dataset built from scientific and technical papers.
% NOTE(review): volume/number/pages are not recorded here; the DOI suffix
% "53.3.205" presumably encodes volume 53, issue 3, first page 205 -- confirm
% against the publisher record before adding those fields.
@article{M05657282,
  author   = {Han, Junho and Choi, Minjun and Kim, Keunha and Ko, Youngjoong},
  title    = {{Korean} Paper Based Retrieval Augmented Generation Dataset},
  journal  = {Journal of {KIISE}, JOK},
  year     = {2026},
  issn     = {2383-630X},
  doi      = {10.5626/JOK.2026.53.3.205},
  keywords = {large language model, retrieval-augmented generation, information retrieval, keyphrase extraction, response generation evaluation},
  abstract = {Large language models (LLMs) trained on general domain data have limitations in specialized fields that are rich in information and technical terminology. Retrieval-augmented generation (RAG) improves answer accuracy and reliability by referencing external knowledge, making it particularly effective in specialized domains where pre-training data is scarce. However, there is a lack of public datasets for Korean specialized domains, highlighting the need for a dedicated retrieval-augmented generation dataset. This paper introduces a new Korean RAG dataset based on scientific and technical papers to support research in this area. We preprocessed existing document-query data to create a searchable corpus and extracted key phrases and key sentences suited for specialized applications. Additionally, we conducted a comprehensive quantitative evaluation of the dataset's quality. By reflecting the unique characteristics of scientific and technical papers, this dataset serves as a robust foundation for Korean RAG systems.},
}