SlideShare una empresa de Scribd logo
1 de 19
Linux 下 poll 和 epoll 内核源码剖析
作者:董昊
epoll 简介
• 传统的 poll 在 fd 特别多时性能骤降
• epoll 是比 select 、 poll 更加高效的用于异
步事件处理的系统调用
• 作者 Davide Libenzi (
http://www.xmailserver.org/linux-patches/nio-improve.html
) 已成为 linux 2.6 内核的标配
[fs/select.c -->sys_poll]
456 asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
457 {
458 struct poll_wqueues table;
459 int fdcount, err;
460 unsigned int i;
461 struct poll_list *head;
462 struct poll_list *walk;
463
464 /* Do a sanity check on nfds ... */
/* 用户给的 nfds 数不可以超过一个 struct file 结构支持的最大 fd 数(默认是 256 ) */
465 if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
466 return -EINVAL;
467
468 if (timeout) {
469 /* Careful about overflow in the intermediate values */
470 if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
471 timeout = (unsigned long)(timeout*HZ+999)/1000+1;
472 else /* Negative or overflow */
473 timeout = MAX_SCHEDULE_TIMEOUT;
474 }
475
[fs/select.c -->sys_poll]
478 head = NULL;
479 walk = NULL;
480 i = nfds;
481 err = -ENOMEM;
482 while(i!=0) {
483 struct poll_list *pp;
484 pp = kmalloc(sizeof(struct poll_list)+
485 sizeof(struct pollfd)*
486 (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i),
487 GFP_KERNEL);
488 if(pp==NULL)
489 goto out_fds;
490 pp->next=NULL;
491 pp->len = (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i);
492 if (head == NULL)
493 head = pp;
494 else
495 walk->next = pp;
496
497 walk = pp;
498 if (copy_from_user(pp->entries, ufds + nfds-i,
499 sizeof(struct pollfd)*pp->len)) {
500 err = -EFAULT;
501 goto out_fds;
502 }
503 i -= pp->len;
504 }
505 fdcount = do_poll(nfds, head, &table, timeout);
• 当用户传入的 fd 很多时,由于 poll 系统调
用每次都要把所有 struct pollfd 拷进内核,
所以参数传递和页分配此时就成了 poll 系
统调用的性能瓶颈。
  427 static int do_poll(unsigned int nfds,  struct poll_list *list,          
  428             struct poll_wqueues *wait, long timeout)                    
  429 {                                                                       
  430     int count = 0;
  431     poll_table* pt = &wait->pt;
  432 
  433     if (!timeout)
  434         pt = NULL;
  435 
  436     for (;;) {
  437         struct poll_list *walk;
  438         set_current_state(TASK_INTERRUPTIBLE);
  439         walk = list;
  440         while(walk != NULL) {
  441             do_pollfd( walk->len, walk->entries, &pt, &count);
  442             walk = walk->next;
  443         }
  444         pt = NULL;
  445         if (count || !timeout || signal_pending(current))
  446             break;
  447         count = wait->error;
  448         if (count)
  449             break;
  450         timeout = schedule_timeout(timeout); 
/*  让 current 挂起,别的进程跑, timeout 到了以后再回来运行 current*/
  451     }
  452     __set_current_state(TASK_RUNNING);
  453     return count;
  454 } 
[fs/select.c-->sys_poll()-->do_poll()]
  395 static void do_pollfd(unsigned int num, struct pollfd * fdpage,
  396     poll_table ** pwait, int *count)
  397 {
  398     int i;
  399 
  400     for (i = 0; i < num; i++) {                                         
  401         int fd;                                                         
  402         unsigned int mask;                                              
  403         struct pollfd *fdp;                                             
  404                                                                         
  405         mask = 0;                                                       
  406         fdp = fdpage+i;                                                 
  407         fd = fdp->fd;                                                   
  408         if (fd >= 0) {                                                  
  409             struct file * file = fget(fd);                              
  410             mask = POLLNVAL;                                            
  411             if (file != NULL) {                                         
  412                 mask = DEFAULT_POLLMASK;                                
  413                 if (file->f_op && file->f_op->poll)                     
  414                     mask = file->f_op->poll(file, *pwait);              
  415                 mask &= fdp->events | POLLERR | POLLHUP;                
  416                 fput(file);                                             
  417             }                                                           
  418             if (mask) {                                                 
  419                 *pwait = NULL;                                          
  420                 (*count)++;                                             
  421             }                                                           
  422         }                                                               
  423         fdp->revents = mask;                                            
  424     }                                                                   
• 注意标红的 440-443 行,当用户传入的 fd
很多时(比如 1000 个),对 do_pollfd 就
会调用很多次, poll 效率瓶颈的另一原因
就在这里。
• 如果 fd 对应的是某个 socket , do_pollfd 调用的
就是网络设备驱动实现的 poll ;如果 fd 对应的是
某个 ext3 文件系统上的一个打开文件, 那
do_pollfd 调用的就是 ext3 文件系统驱动实现的
poll 。一句话,这个 file->f_op->poll 是设备驱动
程序实现的
• 其实,设备驱动程序的标准实现是:调用
poll_wait ,即以设备自己的等待队列为参数(通
常设备都有自己的等待队列)调用 struct
poll_table 的回调函数。
epoll 原理
• 首先,每次 poll 都要把 1000 个 fd  拷入内核,太
不科学了,内核干嘛不自己保存已经拷入的 fd 呢
?答对了, epoll 就是自己保存拷入的 fd ,它的
API 就已经说明了这一点——不是 epoll_wait 的
时候才传入 fd ,而是通过 epoll_ctl 把所有 fd 传
入内核再一起“ wait” ,这就省掉了不必要的重复
拷贝。
• 其次,在 epoll_wait 时,也不是把 current 轮流
的加入 fd 对应的设备等待队列,而是在设备等待
队列醒来时调用一个回调函数(当然,这就需要
“唤醒回调”机制),把产生事件的 fd 归入一个链
表,然后返回这个链表上的 fd 。
[fs/eventpoll.c-->eventpoll_init()]
  1582 static int __init eventpoll_init(void)
  1583 {
  1584     int error;
  1585 
  1586     init_MUTEX(&epsem);
  1587 
  1588     /* Initialize the structure used to perform safe poll wait head wake ups */
  1589     ep_poll_safewake_init(&psw);
…..
  1601     /*
  1602      * Register the virtual file system that will be the source of inodes
  1603      * for the eventpoll files
  1604      */
  1605     error = register_filesystem(&eventpoll_fs_type);
  1606     if (error)
  1607         goto epanic;
  1608 
  1609     /* Mount the above commented virtual file system */
  1610     eventpoll_mnt = kern_mount(&eventpoll_fs_type);
  1611     error = PTR_ERR(eventpoll_mnt);
  1612     if (IS_ERR(eventpoll_mnt))
  1613         goto epanic;
  1614 
  1615     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
  1616             current));
  1617     return 0;
  1618 
  1619 epanic:
  1620     panic("eventpoll_init() failed\n");
  1621 } 
• 这个 module 在初始化时注册了一个新的文件系
统,叫“ eventpollfs” (在 eventpoll_fs_type 结构
里),然后挂载此文件系统。另外创建两个内核
cache (在内核编程中,如果需要频繁分配小块
内存,应该创建 kmem_cache 来做“内存池”) , 分
别用于存放 struct epitem 和 eppoll_entry 。
• 想想 epoll_create 为什么会返回一个新的 fd ?因
为它就是在这个叫做 "eventpollfs" 的文件系统里
创建了一个新文件
• 为什么是个新文件系统?
• linux 的系统调用有多少是返回指针的?你会发现
几乎没有!(特此强调, malloc 不是系统调用,
malloc 调用的 brk 才是)因为 linux 做为 unix 的
最杰出的继承人,它遵循了 unix 的一个巨大优点
——一切皆文件
• 使用文件系统有个好处: epoll_create 返回的是
一个 fd ,而不是该死的指针,指针如果指错了,
你简直没办法判断,而 fd 则可以通过
current->files->fd_array[] 找到其真伪。
[fs/eventpoll.c-->sys_epoll_ctl()]
   524 asmlinkage long
   525 sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
   526 {
....
   575     epi = ep_find(ep, tfile, fd);
   576 
   577     error = -EINVAL;
   578     switch (op) {
   579     case EPOLL_CTL_ADD:
   580         if (!epi) {
   581             epds.events |= POLLERR | POLLHUP;
   582 
   583             error = ep_insert(ep, &epds, tfile, fd);
   584         } else
   585             error = -EEXIST;
   586         break;
   587     case EPOLL_CTL_DEL:
   588         if (epi)
   589             error = ep_remove(ep, epi);
   590         else
   591             error = -ENOENT;
   592         break;
   593     case EPOLL_CTL_MOD:
   594         if (epi) {
   595             epds.events |= POLLERR | POLLHUP;
   596             error = ep_modify(ep, epi, &epds);
   597         } else
   598             error = -ENOENT;
   599         break;
   600     } 
• 看 ep_find 的调用方式, ep 参数应该是指向这个
“大结构”的指针,再看 ep = file->private_data
• 具体看看 ep_find 的实现,发现原来是 struct 
eventpoll 的 rbr 成员( struct rb_root ),是一个
红黑树的根!而红黑树上挂的都是 struct 
epitem 。
• 现在清楚了,一个新创建的 epoll 文件带有一个
struct eventpoll 结构,这个结构上再挂一个红黑
树,而这个红黑树就是每次 epoll_ctl 时 fd 存放的
地方!
[fs/eventpoll.c-->ep_ptable_queue_proc()]
   883 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
   884                  poll_table *pt)
   885 {
   886     struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
   887     struct eppoll_entry *pwq;
   888 
   889     if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
   890         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
   891         pwq->whead = whead;
   892         pwq->base = epi;
   893         add_wait_queue(whead, &pwq->wait);
   894         list_add_tail(&pwq->llink, &epi->pwqlist);
   895         epi->nwait++;
   896     } else {
   897         /* We have to signal that an error occurred */
   898         epi->nwait = -1;
   899     }
   900 } 
• 上面的代码就是 ep_insert 中要做的最重要的事:
创建 struct eppoll_entry ,设置其唤醒回调函数为
ep_poll_callback ,然后加入设备等待队列(注意
这里的 whead 就是上一章所说的每个设备驱动
都要带的等待队列)
• 只有这样,当设备就绪,唤醒等待队列上的等待
者时, ep_poll_callback 就会被调用。 poll 是自
己挨个去问每个 fd“ 好没有?”、“好没
有?”( poll ),而 epoll 是给了每个 fd 一个命令
“好了就调我的函数!”,然后就去睡了,等着设备
驱动通知它 ....epoll 的方法显然更高效
为什么研究内核?
• 要获得最高的服务器使用效能,就必须优
化所有软件,而操作系统它不过也就是一
个软件。
• 知其然,亦知其所以然
• 本 ppt 全文
– http://donghao.org/2009/08/linuxiapolliepollaueouaeaeeio.html

Más contenido relacionado

La actualidad más candente

Oprofile linux
Oprofile linuxOprofile linux
Oprofile linuxFeng Yu
 
Redis 存储分片之代理服务twemproxy 测试
Redis 存储分片之代理服务twemproxy 测试Redis 存储分片之代理服务twemproxy 测试
Redis 存储分片之代理服务twemproxy 测试kaerseng
 
Juniper ScreenOS 基于Policy的
Juniper ScreenOS 基于Policy的Juniper ScreenOS 基于Policy的
Juniper ScreenOS 基于Policy的mickchen
 
Heartbeat+my sql+drbd构建高可用mysql方案
Heartbeat+my sql+drbd构建高可用mysql方案Heartbeat+my sql+drbd构建高可用mysql方案
Heartbeat+my sql+drbd构建高可用mysql方案cao jincheng
 
ch9-pv1-the-extended-filesystem-family
ch9-pv1-the-extended-filesystem-familych9-pv1-the-extended-filesystem-family
ch9-pv1-the-extended-filesystem-familyyushiang fu
 

La actualidad más candente (6)

Oprofile linux
Oprofile linuxOprofile linux
Oprofile linux
 
Redis 存储分片之代理服务twemproxy 测试
Redis 存储分片之代理服务twemproxy 测试Redis 存储分片之代理服务twemproxy 测试
Redis 存储分片之代理服务twemproxy 测试
 
Juniper ScreenOS 基于Policy的
Juniper ScreenOS 基于Policy的Juniper ScreenOS 基于Policy的
Juniper ScreenOS 基于Policy的
 
Heartbeat+my sql+drbd构建高可用mysql方案
Heartbeat+my sql+drbd构建高可用mysql方案Heartbeat+my sql+drbd构建高可用mysql方案
Heartbeat+my sql+drbd构建高可用mysql方案
 
ch9-pv1-the-extended-filesystem-family
ch9-pv1-the-extended-filesystem-familych9-pv1-the-extended-filesystem-family
ch9-pv1-the-extended-filesystem-family
 
Linux
LinuxLinux
Linux
 

Destacado

Epoll - from the kernel side
Epoll -  from the kernel sideEpoll -  from the kernel side
Epoll - from the kernel sidellj098
 
Capturando pacotes de rede no kernelspace
Capturando pacotes de rede no kernelspaceCapturando pacotes de rede no kernelspace
Capturando pacotes de rede no kernelspaceCampus Party Brasil
 
Data Structures used in Linux kernel
Data Structures used in Linux kernel Data Structures used in Linux kernel
Data Structures used in Linux kernel assinha
 
epoll() - The I/O Hero
epoll() - The I/O Heroepoll() - The I/O Hero
epoll() - The I/O HeroMohsin Hijazee
 
Uma visão do mundo rails campus party 2011 - fabio akita
Uma visão do mundo rails   campus party 2011 - fabio akitaUma visão do mundo rails   campus party 2011 - fabio akita
Uma visão do mundo rails campus party 2011 - fabio akitaCampus Party Brasil
 
Open Source Systems Performance
Open Source Systems PerformanceOpen Source Systems Performance
Open Source Systems PerformanceBrendan Gregg
 
DevConf 2014 Kernel Networking Walkthrough
DevConf 2014   Kernel Networking WalkthroughDevConf 2014   Kernel Networking Walkthrough
DevConf 2014 Kernel Networking WalkthroughThomas Graf
 
Browsing Linux Kernel Source
Browsing Linux Kernel SourceBrowsing Linux Kernel Source
Browsing Linux Kernel SourceMotaz Saad
 
introduction to linux kernel tcp/ip ptocotol stack
introduction to linux kernel tcp/ip ptocotol stack introduction to linux kernel tcp/ip ptocotol stack
introduction to linux kernel tcp/ip ptocotol stack monad bobo
 
The linux networking architecture
The linux networking architectureThe linux networking architecture
The linux networking architecturehugo lu
 
The TCP/IP Stack in the Linux Kernel
The TCP/IP Stack in the Linux KernelThe TCP/IP Stack in the Linux Kernel
The TCP/IP Stack in the Linux KernelDivye Kapoor
 
Linux architecture
Linux architectureLinux architecture
Linux architecturemcganesh
 
Blazing Performance with Flame Graphs
Blazing Performance with Flame GraphsBlazing Performance with Flame Graphs
Blazing Performance with Flame GraphsBrendan Gregg
 
Linux Performance Analysis: New Tools and Old Secrets
Linux Performance Analysis: New Tools and Old SecretsLinux Performance Analysis: New Tools and Old Secrets
Linux Performance Analysis: New Tools and Old SecretsBrendan Gregg
 
Linux Systems Performance 2016
Linux Systems Performance 2016Linux Systems Performance 2016
Linux Systems Performance 2016Brendan Gregg
 
Broken Linux Performance Tools 2016
Broken Linux Performance Tools 2016Broken Linux Performance Tools 2016
Broken Linux Performance Tools 2016Brendan Gregg
 
Velocity 2015 linux perf tools
Velocity 2015 linux perf toolsVelocity 2015 linux perf tools
Velocity 2015 linux perf toolsBrendan Gregg
 

Destacado (20)

Epoll - from the kernel side
Epoll -  from the kernel sideEpoll -  from the kernel side
Epoll - from the kernel side
 
Capturando pacotes de rede no kernelspace
Capturando pacotes de rede no kernelspaceCapturando pacotes de rede no kernelspace
Capturando pacotes de rede no kernelspace
 
Data Structures used in Linux kernel
Data Structures used in Linux kernel Data Structures used in Linux kernel
Data Structures used in Linux kernel
 
epoll() - The I/O Hero
epoll() - The I/O Heroepoll() - The I/O Hero
epoll() - The I/O Hero
 
Uma visão do mundo rails campus party 2011 - fabio akita
Uma visão do mundo rails   campus party 2011 - fabio akitaUma visão do mundo rails   campus party 2011 - fabio akita
Uma visão do mundo rails campus party 2011 - fabio akita
 
Open Source Systems Performance
Open Source Systems PerformanceOpen Source Systems Performance
Open Source Systems Performance
 
DevConf 2014 Kernel Networking Walkthrough
DevConf 2014   Kernel Networking WalkthroughDevConf 2014   Kernel Networking Walkthrough
DevConf 2014 Kernel Networking Walkthrough
 
Browsing Linux Kernel Source
Browsing Linux Kernel SourceBrowsing Linux Kernel Source
Browsing Linux Kernel Source
 
introduction to linux kernel tcp/ip ptocotol stack
introduction to linux kernel tcp/ip ptocotol stack introduction to linux kernel tcp/ip ptocotol stack
introduction to linux kernel tcp/ip ptocotol stack
 
Socket System Calls
Socket System CallsSocket System Calls
Socket System Calls
 
The linux networking architecture
The linux networking architectureThe linux networking architecture
The linux networking architecture
 
The TCP/IP Stack in the Linux Kernel
The TCP/IP Stack in the Linux KernelThe TCP/IP Stack in the Linux Kernel
The TCP/IP Stack in the Linux Kernel
 
Basic Linux Internals
Basic Linux InternalsBasic Linux Internals
Basic Linux Internals
 
Linux architecture
Linux architectureLinux architecture
Linux architecture
 
Blazing Performance with Flame Graphs
Blazing Performance with Flame GraphsBlazing Performance with Flame Graphs
Blazing Performance with Flame Graphs
 
Linux Performance Analysis: New Tools and Old Secrets
Linux Performance Analysis: New Tools and Old SecretsLinux Performance Analysis: New Tools and Old Secrets
Linux Performance Analysis: New Tools and Old Secrets
 
Linux Systems Performance 2016
Linux Systems Performance 2016Linux Systems Performance 2016
Linux Systems Performance 2016
 
Broken Linux Performance Tools 2016
Broken Linux Performance Tools 2016Broken Linux Performance Tools 2016
Broken Linux Performance Tools 2016
 
Architecture Of The Linux Kernel
Architecture Of The Linux KernelArchitecture Of The Linux Kernel
Architecture Of The Linux Kernel
 
Velocity 2015 linux perf tools
Velocity 2015 linux perf toolsVelocity 2015 linux perf tools
Velocity 2015 linux perf tools
 

Similar a Linux下Poll和Epoll内核源码剖析

Linux Tracing System 浅析 & eBPF框架开发经验分享
Linux Tracing System 浅析 & eBPF框架开发经验分享Linux Tracing System 浅析 & eBPF框架开发经验分享
Linux Tracing System 浅析 & eBPF框架开发经验分享happyagan
 
Bigdata 大資料分析實務 (進階上機課程)
Bigdata 大資料分析實務 (進階上機課程)Bigdata 大資料分析實務 (進階上機課程)
Bigdata 大資料分析實務 (進階上機課程)家雋 莊
 
文件系统简述.pptx
文件系统简述.pptx文件系统简述.pptx
文件系统简述.pptxGuoliangDing3
 
Osc scott linux下的数据库优化for_postgresql
Osc scott linux下的数据库优化for_postgresqlOsc scott linux下的数据库优化for_postgresql
Osc scott linux下的数据库优化for_postgresqlOpenSourceCamp
 
Hadoop+spark實作
Hadoop+spark實作Hadoop+spark實作
Hadoop+spark實作FEG
 
ch13-pv1-system-calls
ch13-pv1-system-callsch13-pv1-system-calls
ch13-pv1-system-callsyushiang fu
 
Device Driver - Chapter 3字元驅動程式
Device Driver - Chapter 3字元驅動程式Device Driver - Chapter 3字元驅動程式
Device Driver - Chapter 3字元驅動程式ZongYing Lyu
 

Similar a Linux下Poll和Epoll内核源码剖析 (7)

Linux Tracing System 浅析 & eBPF框架开发经验分享
Linux Tracing System 浅析 & eBPF框架开发经验分享Linux Tracing System 浅析 & eBPF框架开发经验分享
Linux Tracing System 浅析 & eBPF框架开发经验分享
 
Bigdata 大資料分析實務 (進階上機課程)
Bigdata 大資料分析實務 (進階上機課程)Bigdata 大資料分析實務 (進階上機課程)
Bigdata 大資料分析實務 (進階上機課程)
 
文件系统简述.pptx
文件系统简述.pptx文件系统简述.pptx
文件系统简述.pptx
 
Osc scott linux下的数据库优化for_postgresql
Osc scott linux下的数据库优化for_postgresqlOsc scott linux下的数据库优化for_postgresql
Osc scott linux下的数据库优化for_postgresql
 
Hadoop+spark實作
Hadoop+spark實作Hadoop+spark實作
Hadoop+spark實作
 
ch13-pv1-system-calls
ch13-pv1-system-callsch13-pv1-system-calls
ch13-pv1-system-calls
 
Device Driver - Chapter 3字元驅動程式
Device Driver - Chapter 3字元驅動程式Device Driver - Chapter 3字元驅動程式
Device Driver - Chapter 3字元驅動程式
 

Más de Hao(Robin) Dong

Más de Hao(Robin) Dong (9)

Transformer and BERT
Transformer and BERTTransformer and BERT
Transformer and BERT
 
Google TPU
Google TPUGoogle TPU
Google TPU
 
flashcache原理及改造
flashcache原理及改造flashcache原理及改造
flashcache原理及改造
 
ext2-110628041727-phpapp02
ext2-110628041727-phpapp02ext2-110628041727-phpapp02
ext2-110628041727-phpapp02
 
Ext4 Bigalloc report public
Ext4 Bigalloc report publicExt4 Bigalloc report public
Ext4 Bigalloc report public
 
Overlayfs and VFS
Overlayfs and VFSOverlayfs and VFS
Overlayfs and VFS
 
Ext4 new feature - bigalloc
Ext4 new feature - bigallocExt4 new feature - bigalloc
Ext4 new feature - bigalloc
 
why we need ext4
why we need ext4why we need ext4
why we need ext4
 
Kernel在多核机器上的负载均衡机制
Kernel在多核机器上的负载均衡机制Kernel在多核机器上的负载均衡机制
Kernel在多核机器上的负载均衡机制
 

Linux下Poll和Epoll内核源码剖析