% Journal article record (key M50093CF1).
% Authors are separated with " and " so BibTeX parses them as two people;
% a comma between names would be read as a single "Last, First" name.
% NOTE(review): volume/number/pages are absent. The DOI suffix "50.5.401"
% suggests volume 50, number 5, first page 401 -- confirm against the
% publisher record before adding those fields.
% NOTE(review): the journal value carries a trailing ", JOK" abbreviation
% from the export; confirm whether the canonical name is "Journal of KIISE".
@article{M50093CF1,
  author   = {Kim, Dokyoung and Lee, Changki},
  title    = {Document-level Machine Translation Data Augmentation Using a Cluster Algorithm and {NSP}},
  journal  = {Journal of KIISE, JOK},
  year     = {2023},
  issn     = {2383-630X},
  doi      = {10.5626/JOK.2023.50.5.401},
  keywords = {neural machine translation,document-level machine translation,data augmentation,G-Transformer,NSP(Next Sentence Prediction)},
  abstract = {In recent years, research on document level machine translation has been actively conducted to understand the context of the entire document and perform natural translation. Similar to the sentence-level machine translation model, a large amount of training data is required for training of the document-level machine translation model, but there is great difficulty in building a large amount of document-level parallel corpus. Therefore, in this paper, we propose a data augmentation technique effective for document-level machine translation in order to improve the lack of parallel corpus per document. As a result of the experiment, by applying the data augmentation technique using the cluster algorithm and NSP to the sentence unit parallel corpus without context, the performance of the document-level machine translation is improved by S-BLEU 3.0 and D-BLEU 2.7 compared to that before application of the data augmentation technique.},
}