@article{M710EC6BE,
  title    = "{AI} Tax: Performance Analysis of {AI} Inference Serving",
  journal  = "Journal of KIISE, JOK",
  year     = "2026",
  issn     = "2383-630X",
  doi      = "10.5626/JOK.2026.53.1.8",
  author   = "Heetaek Jeong and Jangwoo Kim",
  keywords = "AI Tax, AI Inference serving, AI pre-processing, End-to-end performance analysis",
  abstract = "With the rapid advancement of artificial intelligence (AI), smart applications powered by compute- and memory-intensive AI models now make up a significant portion of modern datacenter workloads. To meet the growing demands of AI workloads, specialized accelerators are increasingly deployed in datacenters to enhance AI inference efficiency. However, most previous studies on AI inference acceleration have focused primarily on the performance of neural network computations in isolation. In addition to these computations, an AI inference server typically handles other essential infrastructure tasks, such as web serving to send and receive inference requests and responses, as well as application-specific pre- and post-processing. In this paper, we refer to these additional operations as the AI Tax. We analyze the AI Tax in a representative modern AI inference server that runs various image classification models using Nvidia's industry-standard AI serving software stack. Our findings reveal that the AI Tax degrades end-to-end server performance by up to 55% compared to standalone neural network compute and consumes an average of 25 CPU cores."
}