SlideShare una empresa de Scribd logo
1 de 19
Descargar para leer sin conexión
First Attempt at Patching Kernel
  A Simple Fix of VMA Merging Issue


       Caspar Zhang @ linuxfb

         caspar@casparzhang.com


         September 19, 2011
Agenda


  Background


  Issue Spotted


  Analysis


  Patchwork




                  First Attempt at Patching Kernel   2/19
Background
     Glossary: VMA
     <linux/mm_types.h>: struct vm_area_struct
     $ cat /proc/<pid>/maps
     mbind(): Set NUMA policy for a memory range
     Glossary: NUMA




                       First Attempt at Patching Kernel   3/19
Issue Spotted

  An upstream commit with reproducer.
  commit 9d8cebd4bcd7c3878462fdfda34bbcdeb4df7ef4
  Author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
  Date: Fri Mar 5 13:41:57 2010 -0800

        mm: fix mbind vma merge problem

     Strangely, current mbind() doesn’t merge vma with neighbor vma
  although it’s possible.
     Unfortunately, many vma can reduce performance...

        This patch fixes it.

        reproduced program
  ...


                              First Attempt at Patching Kernel        4/19
Reproducer

 1          addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE,
 2                  MAP_ANON|MAP_PRIVATE, 0, 0);
 3          if (addr == MAP_FAILED)
 4              perror("mmap "), exit(1);
 5
 6          /* make page populate */
 7          memset(addr, 0, pagesize*3);
 8
 9          /* first mbind */
10          err = mbind(addr+pagesize, pagesize, MPOL_BIND, nmask->maskp,
11                  nmask->size, MPOL_MF_MOVE_ALL);
12
13          /* second mbind */
14          err = mbind(addr, pagesize*3, MPOL_DEFAULT, NULL, 0, 0);


     mmap:               |==========================|
     mbind1: ...|--------|========|oooooooo|========|--------|...
     mbind2: ...|--------|========|========|========|--------|...
                A        B        C        D        E        F



                                First Attempt at Patching Kernel            5/19
Issue Spotted (cont.)

   An upstream commit with reproducer(cont.)
   result without this patch
   addr = 0x7fe26ef09000
   [snip]
   7fe26ef09000-7fe26ef0a000 rw-p 00000000 00:00 0
   7fe26ef0a000-7fe26ef0b000 rw-p 00000000 00:00 0
   7fe26ef0b000-7fe26ef0c000 rw-p 00000000 00:00 0
   7fe26ef0c000-7fe26ef0d000 rw-p 00000000 00:00 0
   => 0x7fe26ef09000-0x7fe26ef0c000 have three vmas.

   result with this patch
   addr = 0x7fc9ebc76000
   [snip]
   7fc9ebc76000-7fc9ebc7a000 rw-p 00000000 00:00 0
   7fffbe690000-7fffbe6a5000 rw-p 00000000 00:00 0 [stack]
   => 0x7fc9ebc76000-0x7fc9ebc7a000 have only one vma.

                        First Attempt at Patching Kernel     6/19
Issue Spotted (cont.)
      port the reproducer to LTP — does not work
      fix bug in LTP — still does not work
      suspect a kernel bug




                        First Attempt at Patching Kernel   7/19
Analysis - mbind_range()

 1   /* Step 2: apply policy to a range and do splits. */
 2   static int mbind_range(struct mm_struct *mm, unsigned long start,
 3                  unsigned long end, struct mempolicy *new_pol)
 4   {
 5       struct vm_area_struct *next;
 6       struct vm_area_struct *prev;
 7       struct vm_area_struct *vma;
 8       int err = 0;
 9       pgoff_t pgoff;
10       unsigned long vmstart;
11       unsigned long vmend;
12
13       vma = find_vma_prev(mm, start, &prev);
14       if (!vma || vma->vm_start > start)
15           return -EFAULT;


               start                       end
     ...|--------|========|========|========|--------|...
        A prev B vma      C        D        E        F
             vma->vm_start
                                 First Attempt at Patching Kernel        8/19
Analysis - loop

 1     for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 2         next = vma->vm_next;
 3         vmstart = max(start, vma->vm_start);
 4         vmend = min(end, vma->vm_end);
 5
 6         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 7         prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 8                   vma->anon_vma, vma->vm_file, pgoff, new_pol);
 9         if (prev) {
10             vma = prev;
11             next = vma->vm_next;
12             continue;
13         }
14         [snip]
15     }


               start                       end
     ...|--------|========|========|========|--------|...
        A prev B vma      C        D        E        F
             vma->vm_start
                               First Attempt at Patching Kernel               9/19
Analysis - snip part
     split if merged out of range:
 1      vmstart = max(start, vma->vm_start);
 2      vmend = min(end, vma->vm_end);
 3      ...
 4
 5         if (vma->vm_start != vmstart) {
 6             err = split_vma(vma->vm_mm, vma, vmstart, 1);
 7             if (err)
 8                 goto out;
 9         }
10         if (vma->vm_end != vmend) {
11             err = split_vma(vma->vm_mm, vma, vmend, 0);
12             if (err)
13                 goto out;
14         }
15         err = policy_vma(vma, new_pol);
16         if (err)
17             goto out;




                                First Attempt at Patching Kernel   10/19
Analysis - vma_merge()

 1     if (prev && prev->vm_end == addr &&
 2               mpol_equal(vma_policy(prev), policy) &&
 3               can_vma_merge_after(prev, vm_flags,
 4                         anon_vma, file, pgoff)) {
 5         [snip]
 6     }
 7     if (next && end == next->vm_start &&
 8              mpol_equal(policy, vma_policy(next)) &&
 9              can_vma_merge_before(next, vm_flags,
10                     anon_vma, file, pgoff+pglen)) {
11         [snip]
12     }


               start                       end
     ...|--------|========|========|========|--------|...
        A        B        C        D        E        F
           prev      vma     next
                    prev     vma      next

                               First Attempt at Patching Kernel   11/19
Analysis - can_vma_merge_before()

 1   static int
 2   can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
 3       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 4   {
 5       if (is_mergeable_vma(vma, file, vm_flags) &&
 6           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 7           if (vma->vm_pgoff == vm_pgoff)
 8               return 1;
 9       }
10       return 0;
11   }


               start                       end
     ...|--------|========|========|========|--------|...
        A        B        C vma    D        E        F
                          ˆvm_pgoff
     vma_merge(): (vma)     (next)
     vma_merge():ˆpgoff
     vma_merge():|-pglen -|

                                 First Attempt at Patching Kernel               12/19
Analysis - can_vma_merge_after()

 1   static int
 2   can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 3       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 4   {
 5       if (is_mergeable_vma(vma, file, vm_flags) &&
 6           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 7           pgoff_t vm_pglen;
 8           vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 9           if (vma->vm_pgoff + vm_pglen == vm_pgoff)
10               return 1;
11       }
12       return 0;
13   }

               start                       end
     ...|--------|========|========|========|--------|...
        A         B vma    C        D       E        F
                  |-pglen -|
                           ˆvm_pgoff
                  ˆvma->vm_pgoff
     vma_merge(): (prev)
     vma_merge():          ˆpgoff
                                 First Attempt at Patching Kernel              13/19
Analysis - tracing
 1   mempolicy.c:
 2           vmstart = max(start, vma->vm_start);
 3           vmend = min(end, vma->vm_end);
 4           prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 5                     vma->anon_vma, vma->vm_file, pgoff, new_pol);
 6
 7   mmap.c:
 8   struct vm_area_struct *vma_merge(struct mm_struct *mm,
 9               struct vm_area_struct *prev, unsigned long addr,
10               unsigned long end, unsigned long vm_flags,
11                    struct anon_vma *anon_vma, struct file *file,
12               pgoff_t pgoff, struct mempolicy *policy)
13   {
14       pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
15
16       can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen);
17       [snip]
18   }
19
20   static int
21   can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
22       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
23   {
24       if (is_mergeable_vma(vma, file, vm_flags) &&
25           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
26           if (vma->vm_pgoff == vm_pgoff)
27               return 1;
28       }
29       return 0;
30   }

                                          First Attempt at Patching Kernel      14/19
Analysis - tracing (cont.)
   Wrong:

       pgoff = vma->vm_pgoff +
               ((start - vma->vm_start) >> PAGE_SHIFT);
       pgoff + e - s == next->vm_pgoff ?

              start                       end
    ...|--------|========|========|========|--------|...
   off:0        1        2        3        4        5
       A        B        C        D        E        F
   1.           ˆ   vma                         not merge
   2.           ˆ        s   vma e next         not merge
                pgoff = 2 + (1 - 2) = 1
                pgoff + 3 - 2 = 2 != 3



                      First Attempt at Patching Kernel      15/19
Analysis - tracing (cont.)
   Right:

       pgoff = vma->vm_pgoff;
       pgoff + e - s == next->vm_pgoff ?

              start                       end
    ...|--------|========|========|========|--------|...
   off:0        1        2        3        4        5
       A        B        C        D        E        F
   1.           ˆ   vma                         not merge
   2.                    ˆs vma e next          merge!
                pgoff = 2
                pgoff + 3 - 2 = 3 == 3




                      First Attempt at Patching Kernel      16/19
Patchwork

 1   diff --git a/mm/mempolicy.c b/mm/mempolicy.c
 2   index 8b57173..b1f70d6 100644
 3   --- a/mm/mempolicy.c
 4   +++ b/mm/mempolicy.c
 5   @@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 6        struct vm_area_struct *prev;
 7        struct vm_area_struct *vma;
 8        int err = 0;
 9   -    pgoff_t pgoff;
10        unsigned long vmstart;
11        unsigned long vmend;
12
13   @@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
14            vmstart = max(start, vma->vm_start);
15            vmend = min(end, vma->vm_end);
16
17   -       pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
18           prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
19   -                 vma->anon_vma, vma->vm_file, pgoff, new_pol);
20   +                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
21   +                 new_pol);
22           if (prev) {
23               vma = prev;
24               next = vma->vm_next;




                                          First Attempt at Patching Kernel                   17/19
Questions?




First Attempt at Patching Kernel   18/19
Thank you!




First Attempt at Patching Kernel   19/19

Más contenido relacionado

Similar a Caspar's First Kernel Patch

Athenticated smaba server config with open vpn
Athenticated smaba server  config with open vpnAthenticated smaba server  config with open vpn
Athenticated smaba server config with open vpn
Chanaka Lasantha
 

Similar a Caspar's First Kernel Patch (20)

Scilab presentation
Scilab presentation Scilab presentation
Scilab presentation
 
MariaDB Server on macOS - FOSDEM 2022 MariaDB Devroom
MariaDB Server on macOS -  FOSDEM 2022 MariaDB DevroomMariaDB Server on macOS -  FOSDEM 2022 MariaDB Devroom
MariaDB Server on macOS - FOSDEM 2022 MariaDB Devroom
 
Virtual machines - how they work
Virtual machines - how they workVirtual machines - how they work
Virtual machines - how they work
 
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all startedKernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
 
When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)
 
GTC16 - S6510 - Targeting GPUs with OpenMP 4.5
GTC16 - S6510 - Targeting GPUs with OpenMP 4.5GTC16 - S6510 - Targeting GPUs with OpenMP 4.5
GTC16 - S6510 - Targeting GPUs with OpenMP 4.5
 
Lecture#5 Operators in C++
Lecture#5 Operators in C++Lecture#5 Operators in C++
Lecture#5 Operators in C++
 
Migrating KSM page causes the VM lock up as the KSM page merging list is too ...
Migrating KSM page causes the VM lock up as the KSM page merging list is too ...Migrating KSM page causes the VM lock up as the KSM page merging list is too ...
Migrating KSM page causes the VM lock up as the KSM page merging list is too ...
 
Kernel crashdump
Kernel crashdumpKernel crashdump
Kernel crashdump
 
malloc & vmalloc in Linux
malloc & vmalloc in Linuxmalloc & vmalloc in Linux
malloc & vmalloc in Linux
 
Experience on porting HIGHMEM and KASAN to RISC-V at COSCUP 2020
Experience on porting HIGHMEM and KASAN to RISC-V at COSCUP 2020Experience on porting HIGHMEM and KASAN to RISC-V at COSCUP 2020
Experience on porting HIGHMEM and KASAN to RISC-V at COSCUP 2020
 
12c Mini Lesson - Inline PLSQL from SQL
12c Mini Lesson - Inline PLSQL from SQL12c Mini Lesson - Inline PLSQL from SQL
12c Mini Lesson - Inline PLSQL from SQL
 
Wap to implement bitwise operators
Wap to implement bitwise operatorsWap to implement bitwise operators
Wap to implement bitwise operators
 
Operating CloudStack: the easy way (automation!)
Operating CloudStack: the easy way (automation!)Operating CloudStack: the easy way (automation!)
Operating CloudStack: the easy way (automation!)
 
Solaris 10 Advanced Features.
Solaris 10 Advanced Features.Solaris 10 Advanced Features.
Solaris 10 Advanced Features.
 
OpenMP
OpenMPOpenMP
OpenMP
 
New Ways to Find Latency in Linux Using Tracing
New Ways to Find Latency in Linux Using TracingNew Ways to Find Latency in Linux Using Tracing
New Ways to Find Latency in Linux Using Tracing
 
COSCUP 2020 RISC-V 32 bit linux highmem porting
COSCUP 2020 RISC-V 32 bit linux highmem portingCOSCUP 2020 RISC-V 32 bit linux highmem porting
COSCUP 2020 RISC-V 32 bit linux highmem porting
 
Athenticated smaba server config with open vpn
Athenticated smaba server  config with open vpnAthenticated smaba server  config with open vpn
Athenticated smaba server config with open vpn
 
Adding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMPAdding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMP
 

Último

IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
Enterprise Knowledge
 
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptxEIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
Earley Information Science
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Service
giselly40
 

Último (20)

Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreter
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
From Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time AutomationFrom Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time Automation
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?
 
Breaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountBreaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path Mount
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
 
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptxEIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Service
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 
The Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxThe Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptx
 

Caspar's First Kernel Patch

  • 1. First Attempt at Patching Kernel A Simple Fix of VMA Merging Issue Caspar Zhang @ linuxfb caspar@casparzhang.com September 19, 2011
  • 2. Agenda Background Issue Spotted Analysis Patchwork First Attempt at Patching Kernel 2/19
  • 3. Background Glossary: VMA <linux/mm_types.h>: struct vm_area_struct $ cat /proc/<pid>/maps mbind(): Set NUMA policy for a memory range Glossary: NUMA First Attempt at Patching Kernel 3/19
  • 4. Issue Spotted An upstream commit with reproducer. commit 9d8cebd4bcd7c3878462fdfda34bbcdeb4df7ef4 Author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Date: Fri Mar 5 13:41:57 2010 -0800 mm: fix mbind vma merge problem Strangely, current mbind() doesn’t merge vma with neighbor vma although it’s possible. Unfortunately, many vma can reduce performance... This patch fixes it. reproduced program ... First Attempt at Patching Kernel 4/19
  • 5. Reproducer 1 addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE, 2 MAP_ANON|MAP_PRIVATE, 0, 0); 3 if (addr == MAP_FAILED) 4 perror("mmap "), exit(1); 5 6 /* make page populate */ 7 memset(addr, 0, pagesize*3); 8 9 /* first mbind */ 10 err = mbind(addr+pagesize, pagesize, MPOL_BIND, nmask->maskp, 11 nmask->size, MPOL_MF_MOVE_ALL); 12 13 /* second mbind */ 14 err = mbind(addr, pagesize*3, MPOL_DEFAULT, NULL, 0, 0); mmap: |==========================| mbind1: ...|--------|========|oooooooo|========|--------|... mbind2: ...|--------|========|========|========|--------|... A B C D E F First Attempt at Patching Kernel 5/19
  • 6. Issue Spotted (cont.) An upstream commit with reproducer(cont.) result without this patch addr = 0x7fe26ef09000 [snip] 7fe26ef09000-7fe26ef0a000 rw-p 00000000 00:00 0 7fe26ef0a000-7fe26ef0b000 rw-p 00000000 00:00 0 7fe26ef0b000-7fe26ef0c000 rw-p 00000000 00:00 0 7fe26ef0c000-7fe26ef0d000 rw-p 00000000 00:00 0 => 0x7fe26ef09000-0x7fe26ef0c000 have three vmas. result with this patch addr = 0x7fc9ebc76000 [snip] 7fc9ebc76000-7fc9ebc7a000 rw-p 00000000 00:00 0 7fffbe690000-7fffbe6a5000 rw-p 00000000 00:00 0 [stack] => 0x7fc9ebc76000-0x7fc9ebc7a000 have only one vma. First Attempt at Patching Kernel 6/19
  • 7. Issue Spotted (cont.) port the reproducer to LTP — does not work fix bug in LTP — still does not work suspect a kernel bug First Attempt at Patching Kernel 7/19
  • 8. Analysis - mbind range() 1 /* Step 2: apply policy to a range and do splits. */ 2 static int mbind_range(struct mm_struct *mm, unsigned long start, 3 unsigned long end, struct mempolicy *new_pol) 4 { 5 struct vm_area_struct *next; 6 struct vm_area_struct *prev; 7 struct vm_area_struct *vma; 8 int err = 0; 9 pgoff_t pgoff; 10 unsigned long vmstart; 11 unsigned long vmend; 12 13 vma = find_vma_prev(mm, start, &prev); 14 if (!vma || vma->vm_start > start) 15 return -EFAULT; start end ...|--------|========|========|========|--------|... A prev B vma C D E F vma->start First Attempt at Patching Kernel 8/19
  • 9. Analysis - loop 1 for (; vma && vma->vm_start < end; prev = vma, vma = next) { 2 next = vma->vm_next; 3 vmstart = max(start, vma->vm_start); 4 vmend = min(end, vma->vm_end); 5 6 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 7 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 8 vma->anon_vma, vma->vm_file, pgoff, new_pol); 9 if (prev) { 10 vma = prev; 11 next = vma->vm_next; 12 continue; 13 } 14 [snip] 15 } start end ...|--------|========|========|========|--------|... A prev B vma C D E F vma->start First Attempt at Patching Kernel 9/19
  • 10. Analysis - snip part split if merged out of range: 1 vmstart = max(start, vma->vm_start); 2 vmend = min(end, vma->vm_end); 3 ... 4 5 if (vma->vm_start != vmstart) { 6 err = split_vma(vma->vm_mm, vma, vmstart, 1); 7 if (err) 8 goto out; 9 } 10 if (vma->vm_end != vmend) { 11 err = split_vma(vma->vm_mm, vma, vmend, 0); 12 if (err) 13 goto out; 14 } 15 err = policy_vma(vma, new_pol); 16 if (err) 17 goto out; First Attempt at Patching Kernel 10/19
  • 11. Analysis - vma merge() 1 if (prev && prev->vm_end == addr && 2 mpol_equal(vma_policy(prev), policy) && 3 can_vma_merge_after(prev, vm_flags, 4 anon_vma, file, pgoff)) { 5 [snip] 6 } 7 if (next && end == next->vm_start && 8 mpol_equal(policy, vma_policy(next)) && 9 can_vma_merge_before(next, vm_flags, 10 anon_vma, file, pgoff+pglen)) { 11 [snip] 12 } start end ...|--------|========|========|========|--------|... A B C D E F prev vma next prev vma next First Attempt at Patching Kernel 11/19
  • 12. Analysis - can vma merge before() 1 static int 2 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 3 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 4 { 5 if (is_mergeable_vma(vma, file, vm_flags) && 6 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 7 if (vma->vm_pgoff == vm_pgoff) 8 return 1; 9 } 10 return 0; 11 } start end ...|--------|========|========|========|--------|... A B C vma D E F ˆvm_pgoff vma_merge(): (vma) (next) vma_merge():ˆpgoff vma_merge():|-pglen -| First Attempt at Patching Kernel 12/19
  • 13. Analysis - can vma merge after() 1 static int 2 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 3 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 4 { 5 if (is_mergeable_vma(vma, file, vm_flags) && 6 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 7 pgoff_t vm_pglen; 8 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 9 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 10 return 1; 11 } 12 return 0; 13 } start end ...|--------|========|========|========|--------|... A B vma C D E F |-pglen -| ˆvm_pgoff ˆvma->vm_pgoff vma_merge(): (prev) vma_merge(): ˆpgoff First Attempt at Patching Kernel 13/19
  • 14. Analysis - tracing 1 mempolicy.c: 2 vmstart = max(start, vma->vm_start); 3 vmend = min(end, vma->vm_end); 4 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 5 vma->anon_vma, vma->vm_file, pgoff, new_pol); 6 7 mmap.c: 8 struct vm_area_struct *vma_merge(struct mm_struct *mm, 9 struct vm_area_struct *prev, unsigned long addr, 10 unsigned long end, unsigned long vm_flags, 11 struct anon_vma *anon_vma, struct file *file, 12 pgoff_t pgoff, struct mempolicy *policy) 13 { 14 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 15 16 can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen); 17 [snip] 18 } 19 20 static int 21 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 22 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 23 { 24 if (is_mergeable_vma(vma, file, vm_flags) && 25 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 26 if (vma->vm_pgoff == vm_pgoff) 27 return 1; 28 } 29 return 0; 30 } First Attempt at Patching Kernel 14/19
  • 15. Analysis - tracing (cont.) Wrong: pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); pgoff + e - s == next->vm_pgoff ? start end ...|--------|========|========|========|--------|... off:0 1 2 3 4 5 A B C D E F 1. ˆ vma not merge 2. ˆ s vma e next not merge pgoff = 2 + (1 - 2) = 1 pgoff + 3 - 2 = 2 != 3 First Attempt at Patching Kernel 15/19
  • 16. Analysis - tracing (cont.) Right: pgoff = vma->vm_pgoff; pgoff + e - s == next->vm_pgoff ? start end ...|--------|========|========|========|--------|... off:0 1 2 3 4 5 A B C D E F 1. ˆ vma not merge 2. ˆs vma e next merge! pgoff = 2 pgoff + 3 - 2 = 3 == 3 First Attempt at Patching Kernel 16/19
  • 17. Patchwork 1 diff --git a/mm/mempolicy.c b/mm/mempolicy.c 2 index 8b57173..b1f70d6 100644 3 --- a/mm/mempolicy.c 4 +++ b/mm/mempolicy.c 5 @@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, 6 struct vm_area_struct *prev; 7 struct vm_area_struct *vma; 8 int err = 0; 9 - pgoff_t pgoff; 10 unsigned long vmstart; 11 unsigned long vmend; 12 13 @@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, 14 vmstart = max(start, vma->vm_start); 15 vmend = min(end, vma->vm_end); 16 17 - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 18 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 19 - vma->anon_vma, vma->vm_file, pgoff, new_pol); 20 + vma->anon_vma, vma->vm_file, vma->vm_pgoff, 21 + new_pol); 22 if (prev) { 23 vma = prev; 24 next = vma->vm_next; First Attempt at Patching Kernel 17/19
  • 18. Questions? First Attempt at Patching Kernel 18/19
  • 19. Thank you! First Attempt at Patching Kernel 19/19