% Journal article: An and Ko (2019), Journal of KIISE.
% NOTE(review): volume and number are inferred from the DOI suffix
% (46.5.440); confirm against the publisher record and add the page range.
@article{MA98D5124,
  author   = {An, Jaehyun and Ko, Youngjoong},
  title    = {Semi-automatic Expansion for a Chatting Corpus Based on a {K-means} Clustering Method and Similarity Measure},
  journal  = {Journal of {KIISE}},
  volume   = {46},
  number   = {5},
  year     = {2019},
  issn     = {2383-630X},
  doi      = {10.5626/JOK.2019.46.5.440},
  keywords = {chatting system, semi-automatic expansion, similarity, convolutional neural networks, utterance embedding},
  abstract = {In this paper, we proposed a semi-automatic expansion method to expand a chatting corpus using a large amount of utterance data from movie subtitles and drama scripts. To expand the chatting corpus, the proposed system used previously constructed chatting corpus and a similarity measure. If the similarity is calculated between a previously constructed chatting corpus and the input utterance was greater than a threshold value set in the experiment, the input utterance was selected as a new chatting utterance, that it is a correct chatting pair. We used morpheme-unit word embeddings and a Convolutional Neural Networks to efficiently calculate the similarity of the utterance embedding. In order to improve the speed of the semi-automatic expansion process, we proposed to reduce the amount of computation by clustering chat corpus by K-means clustering algorithm. Experimental results showed that the precision, recall, and F1 score of the proposed system were 61.28\%, 53.19\%, and 56.94\%, respectively, which was 5.16\%p, 6.09\%, and 5.73\%p higher than that of the baseline system. The term frequency and the speed of our system were also about a hundred times faster.},
}