@article{M5BC929BC, title = "Efficient and Privacy-Preserving Near-Duplicate Detection in Cloud Computing", journal = "Journal of KIISE, JOK", year = "2017", issn = "2383-630X", doi = "10.5626/JOK.2017.44.10.1112", author = "Changhee Hahn,Hyung June Shin,Junbeom Hur", keywords = "near-duplicate detection,searchable encryption,cloud computing,privacy", abstract = "As content providers further offload content-centric services to the cloud, data retrieval over the cloud typically results in many redundant items because there is a prevalent near-duplication of content on the Internet. Simply fetching all data from the cloud severely degrades efficiency in terms of resource utilization and bandwidth, and data can be encrypted by multiple content providers under different keys to preserve privacy. Thus, locating near-duplicate data in a privacy-preserving way is highly dependent on the ability to deduplicate redundant search results and returns best matches without decrypting data. To this end, we propose an efficient near-duplicate detection scheme for encrypted data in the cloud. Our scheme has the following benefits. First, a single query is enough to locate near-duplicate data even if they are encrypted under different keys of multiple content providers. Second, storage, computation and communication costs are alleviated compared to existing schemes, while achieving the same level of search accuracy. Third, scalability is significantly improved as a result of a novel and efficient two-round detection to locate near-duplicate candidates over large quantities of data in the cloud. An experimental analysis with real-world data demonstrates the applicability of the proposed scheme to a practical cloud system. Last, the proposed scheme is an average of 70.6% faster than an existing scheme." }