SlideShare una empresa de Scribd logo
1 de 26
Descargar para leer sin conexión
A collection of
Micro Optimizations
  by Alex, Gabriel & Michael
Start
● Optimizations from hash.0.c to hash.13.c
● Performance testing:
     gcc -Wall -O3 hash.c -o hash
     perf stat -r 5 -e instructions -e branch-misses hash input input2
     perf stat -r 5 -e cycles hash input input2


● Result:
  Cycles:                     7.292.009.385
  Instructions:               1.063.178.278
  Branch mispredictions:      11.395.359
  Time elapsed:               2.2927 s
Analysis
● Hashtable
  ○   Max. Collisions:   7
  ○   Empty Elements:    363.543
  ○   Amount Elements:   1.048.575
  ○   Input Elements:    724.129


● Good hash table size: ~ 20% of Input
● Brent-Hashing?
● Parallelism?
Convert Linked-Lists to Arrays
● fewer cache misses on frequently used lookup

● overhead due to reorganizing

● struct size reduced from 24 to 16 bytes
  due to removing *next

● faster at large lists
   ○ break even point at HASHSIZE 2^18
Loop peeling: lookup
if(l != NULL) {
    if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)
        return l->value;
    l = l->next;
    while (l!=NULL) {
      if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)
        return l->value;
      l = l->next;
    }
  }
  return -1;

    Cycles:                       7.255.927.875 (-0,495%)
    Instructions:                 1.067.896.719 (+0,444%)
    Branch mispredictions:        11.464.124 (+0,603%)
    Time elapsed:                 2,1613 s (-5,731%)
Inline
inline   struct block slurp(char *filename)
inline   unsigned long hash(char *addr, size_t len)
inline   void insert(char *keyaddr, size_t keylen, int value)
inline   int lookup(char *keyaddr, size_t keylen)



    Cycles:                         7.265.216.080 (+0,128%)
    Instructions:                   1.067.543.945 (-0,033%)
    Branch mispredictions:          11.541.050 (+0,671%)
    Time elapsed:                   2,1672 s (+0,273%)
Replace loop with macro
#define REPEAT10(x) { x x x x x x x x x x }
REPEAT10 (
     for (p=input2.addr, endp=input2.addr+input2.len; p<endp; ) {
       ...
     }
  );


    Cycles:                       7.313.103.515 (+0,659%)
    Instructions:                 1.062.596.883 (-0,463%)
    Branch mispredictions:        11.423.373 (-1,020%)
    Time elapsed:                 2,1791 s (+0,549%)
Some Minor Changes
● new macro for HASHSIZE-1
● Remove unnecessary Casts


... with no effects
Loop peeling + adjust len
inline unsigned long hash(char *addr, size_t len) {

...

if(len > 7 ) {
    len = len - 7;
    x = (*(unsigned long *)addr)*hashmult;
    for (i=8; i<len; i+=8) {
      w = *(unsigned long *)(addr+i);
      x = (x + w)*hashmult;
    }
    len = len + 7;
  }

...
Loop peeling + adjust len


  Cycles:                  8.271.902.713 (+13,111%)
  Instructions:            1.038.690.398 (-2,250%)
  Branch mispredictions:   11.809.722 (+3,382%)
  Time elapsed:            2,4551 s (+12,668%)


=> probably faster for long strings
=> changes discarded
Pointers instead of indices
uint128_t x;
unsigned long * laddr = (unsigned long *) addr;
unsigned long * end = (unsigned long *) (addr+len);

if(len > 7 ) {
  x = *laddr * hashmult;
  end--;
  for (laddr++; laddr <= end; laddr++) {
    x = (x + *laddr)*hashmult;
  }
  if (laddr < (end+1))
    x = ( x + ((*laddr)<< ( ((char*)laddr - (char*)end)*8)) ) * hashmult;
  return x+(x>>64);
} else if (laddr < end) {
  x = (uint128_t)((*laddr)<<((8-len)*8)) * hashmult;
  return x+(x>>64);
}

return 0;
Pointers instead of indices

  Cycles:                  8.253.559.129 (+12,860%)
  Instructions:            1.021.822.315 (-3,837%)
  Branch mispredictions:   11.825.252 (+3,518%)
  Time elapsed:            2,4558 s (+12,700%)



=> probably faster for long strings
=> changes discarded
Improve loop-layout
for (p=input1.addr, endp=input1.addr+input1.len, i=0; p<endp; i++) {
    nextp=memchr(p, '\n', endp-p);
      if (nextp == NULL)
        break;
      ...
  }
------------------------------------------------
for (p=input.addr, endp=input.addr+input.len, r=0,
     nextp=memchr(p, '\n', endp-p); nextp != NULL;
       r++, nextp=memchr(p, '\n', endp-p)) {
       ...
}
Improve loop-layout
  Cycles:                  7.364.723.755 (+0,705%)
  Instructions:            1072512560 (+0,933%)
  Branch mispredictions:   11606354 (+1,601%)
  Time elapsed:            2,2509 s (+3,294%)


=> "if" and "&&" probably similar instructions
   in this case
Remove unnecessary check
for (p=input.addr, endp=input.addr+input.len, r=0,
     nextp=memchr(p, '\n', endp-p); nextp != NULL;
       r++, nextp=memchr(p, '\n', endp-p)) {
       ...
}




Remove unnecessary variables
struct block input1, input2;                         struct block input;

unsigned int i;                                      unsigned long r=0;
unsigned long r=0;
Remove unnecessary
check & variables


 Cycles:                  7323904385 (-0,554%)
 Instructions:            1064977111 (-0,702%)
 Branch mispredictions:   11734428 (+1,103%)
 Time elapsed:            2,2129 s (-1,688%)
Sentinel with rawmemchr
 ●   Idea:
      ○ replace '\0' with '\n' at the end
      ○ use rawmemchr without length check instead
      ○ save compares

endp=input1.addr+input1.len;
 *endp = '\n';

for (p=input1.addr, i=0, nextp=rawmemchr(p, '\n'); p<endp ; i++) {
  nextp=rawmemchr(p, '\n');
  insert(p, nextp-p, i);
  p = nextp+1;
}
Sentinel self made rawmemchr
endp=input.addr+input.len;
*endp = '\n';
p=input.addr;
nextp = p;

for (r=0; nextp<endp; r++) {
  for(;*nextp ^ '\n'; nextp++);
  insert(p, nextp-p, r);
  nextp++;
  p = nextp;
}

  Cycles:                         7.400.275.087 (+1,042%)
  Instructions:                   1.157.591.866 (+8,696%)
  Branch mispredictions:          11.715.914 (-0,158%)
  Time elapsed:                   2,2064 s (-0,293%)
Faster memcmp
inline int mycmp(char* in1, char* in2, int len){
  do{
    if(*in1 ^ *in2) return 0;
    in1++; in2++; len--;
  }while(len>0);
  return 1;
}

if (keylen == l->keylen && mycmp(l->keyaddr, keyaddr, keylen))


    Cycles:                       5.826.523.410 (-21,266%)
    Instructions:                 1.913.851.749 (+65,330%)
    Branch mispredictions:        14.810.147 (+26,410%)
    Time elapsed:                 1,7366 s (-21,292%)
Faster memcmp with Sentinel
*(keyaddr+keylen) = 0; // FROM INSERT

inline int mycmp(char* in1, char* in2, int len){
  while(*in1 == *in2) {
    in1++; in2++; len--;
  }
  return len;
}

if (keylen==l->keylen && !mycmp(l->keyaddr, keyaddr, keylen)) // FROM
LOOKUP


    Cycles:                       5.766.254.891 (-22,080%)
    Instructions:                 1.747.135.165 (+50,928%)
    Branch mispredictions:        14.772.984 (+26,093%)
    Time elapsed:                 1,7182 s (-22,126%)
Caching
int *cache = malloc(size*sizeof(int));       endcache = cache;
int *startcache, *endcache;
startcache = cache;                           REPEAT9 (
endcache = startcache + size;                   cache = startcache;
                                                while (cache < endcache) {
while(nextp<endp) {                               r = r * 2654435761L +
  if (cache >= endcache){                     *cache;
    size = size<<1;                               r = r + (r>>32);
    cache = realloc(cache, size*sizeof(int));     cache++;
  }                                             } );
    for(;*nextp ^ '\n'; nextp++);
    *cache = lookup(p, nextp-p);
    r = r * 2654435761L + *cache;
    r = r + (r>>32);
    cache++; nextp++; p = nextp;
}
Caching + memcmp

 Cycles:                  925.886.063 (-84,109%)
 Instructions:            494.630.615 (-74,155%)
 Branch mispredictions:   2.395.446 (-83,825%)
 Time elapsed:            0,2847 s (-83,603%)
Caching + memcmp with Sentinel

 Cycles:                  925.783.880 (-84,110%)
 Instructions:            475.125.520 (-75,172%)
 Branch mispredictions:   2.418.936 (-83,659%)
 Time elapsed:            0,2839 s (-83,738%)
Approximation of cache size

int size = input.len/6;                       int size = input.len/2;
...
if (cache >= endcache){
  size = size<<1;
  cache = realloc(cache, size*sizeof(int));
}
...


    Cycles:                       930.929.061 (+0,544%)
    Instructions:                 475.676.977 (-3,831%)
    Branch mispredictions:        2.384.999 (-0,436%)
    Time elapsed:                 0,2830 s (-0,586%)
Overall

  Cycles:                  930.929.061 (-87,233%)
  Instructions:            475.676.977 (-55,259%)
  Branch mispredictions:   2.384.999 (-79,070%)
  Time elapsed:            0,2830 s (-87,656%)
Fin
Any Questions?




Code available:
https://github.com/grill/micro-optimisations

Más contenido relacionado

La actualidad más candente

PythonScripting
PythonScriptingPythonScripting
PythonScripting
Sait Elmas
 
The Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High PerformanceThe Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High Performance
MongoDB
 
Arna Friend Controls II Final
Arna Friend Controls II FinalArna Friend Controls II Final
Arna Friend Controls II Final
Arna Friend
 

La actualidad más candente (20)

Use C++ to Manipulate mozSettings in Gecko
Use C++ to Manipulate mozSettings in GeckoUse C++ to Manipulate mozSettings in Gecko
Use C++ to Manipulate mozSettings in Gecko
 
Dun ddd
Dun dddDun ddd
Dun ddd
 
Zone.js 2017
Zone.js 2017Zone.js 2017
Zone.js 2017
 
Time Series Analysis for Network Secruity
Time Series Analysis for Network SecruityTime Series Analysis for Network Secruity
Time Series Analysis for Network Secruity
 
PythonScripting
PythonScriptingPythonScripting
PythonScripting
 
R and cpp
R and cppR and cpp
R and cpp
 
zen and the art of SQL optimization
zen and the art of SQL optimizationzen and the art of SQL optimization
zen and the art of SQL optimization
 
PostgreSQL query planner's internals
PostgreSQL query planner's internalsPostgreSQL query planner's internals
PostgreSQL query planner's internals
 
Kubernetes Tutorial
Kubernetes TutorialKubernetes Tutorial
Kubernetes Tutorial
 
The Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High PerformanceThe Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High Performance
 
Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...
 
Effective Modern C++ - Item 35 & 36
Effective Modern C++ - Item 35 & 36Effective Modern C++ - Item 35 & 36
Effective Modern C++ - Item 35 & 36
 
Arna Friend Controls II Final
Arna Friend Controls II FinalArna Friend Controls II Final
Arna Friend Controls II Final
 
Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
 Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt... Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
 
Code vectorization for mobile devices
Code vectorization for mobile devicesCode vectorization for mobile devices
Code vectorization for mobile devices
 
Bind Peeking - The Endless Tuning Nightmare
Bind Peeking - The Endless Tuning NightmareBind Peeking - The Endless Tuning Nightmare
Bind Peeking - The Endless Tuning Nightmare
 
Exploring Parallel Merging In GPU Based Systems Using CUDA C.
Exploring Parallel Merging In GPU Based Systems Using CUDA C.Exploring Parallel Merging In GPU Based Systems Using CUDA C.
Exploring Parallel Merging In GPU Based Systems Using CUDA C.
 
BGP communities and geotags
BGP communities and geotagsBGP communities and geotags
BGP communities and geotags
 
Valerii Vasylkov Erlang. measurements and benefits.
Valerii Vasylkov Erlang. measurements and benefits.Valerii Vasylkov Erlang. measurements and benefits.
Valerii Vasylkov Erlang. measurements and benefits.
 
Scoped dynamic rewrite rules
Scoped dynamic rewrite rulesScoped dynamic rewrite rules
Scoped dynamic rewrite rules
 

Similar a Efficient Programs

Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)
Ontico
 
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docxHW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docx
wellesleyterresa
 
Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)
Cdiscount
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf Tools
emBO_Conference
 

Similar a Efficient Programs (20)

Nodejs性能分析优化和分布式设计探讨
Nodejs性能分析优化和分布式设计探讨Nodejs性能分析优化和分布式设计探讨
Nodejs性能分析优化和分布式设计探讨
 
Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)
 
Tracing Parallel Execution (UKOUG 2006)
Tracing Parallel Execution (UKOUG 2006)Tracing Parallel Execution (UKOUG 2006)
Tracing Parallel Execution (UKOUG 2006)
 
Performance
PerformancePerformance
Performance
 
Parallel Computing with R
Parallel Computing with RParallel Computing with R
Parallel Computing with R
 
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docxHW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docx
 
Debugging Ruby
Debugging RubyDebugging Ruby
Debugging Ruby
 
Ping to Pong
Ping to PongPing to Pong
Ping to Pong
 
Dive into EXPLAIN - PostgreSql
Dive into EXPLAIN  - PostgreSqlDive into EXPLAIN  - PostgreSql
Dive into EXPLAIN - PostgreSql
 
Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for Speed
 
Debugging Ruby Systems
Debugging Ruby SystemsDebugging Ruby Systems
Debugging Ruby Systems
 
Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)
 
Joker 2015 - Валеев Тагир - Что же мы измеряем?
Joker 2015 - Валеев Тагир - Что же мы измеряем?Joker 2015 - Валеев Тагир - Что же мы измеряем?
Joker 2015 - Валеев Тагир - Что же мы измеряем?
 
Rkf
RkfRkf
Rkf
 
PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기
 
Auto
AutoAuto
Auto
 
lecture7.ppt
lecture7.pptlecture7.ppt
lecture7.ppt
 
Accelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACCAccelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACC
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf Tools
 

Último

Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
Joaquim Jorge
 

Último (20)

ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
 
From Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time AutomationFrom Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time Automation
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreter
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
Exploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone ProcessorsExploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone Processors
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdf
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonets
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
GenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdfGenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdf
 

Efficient Programs

  • 1. A collection of Micro Optimizations by Alex, Gabriel & Michael
  • 2. Start ● Optimizations from hash.0.c to hash.13.c ● Performance testing: gcc -Wall -O3 hash.c -o hash perf stat -r 5 -e instructions -e branch-misses hash input input2 perf stat -r 5 -e cycles hash input input2 ● Result: Cycles: 7.292.009.385 Instructions: 1.063.178.278 Branch mispredictions: 11.395.359 Time elapsed: 2.2927 s
  • 3. Analysis ● Hashtable ○ Max. Collisions: 7 ○ Empty Elements: 363.543 ○ Amount Elements: 1.048.575 ○ Input Elements: 724.129 ● Good hash table size: ~ 20% of Input ● Brent-Hashing? ● Parallelism?
  • 4. Convert Linked-Lists to Arrays ● fewer cache misses on frequently used lookup ● overhead due to reorganizing ● struct size reduced from 24 to 16 bytes due to removing *next ● faster at large lists ○ break even point at HASHSIZE 2^18
  • 5. Loop peeling: lookup if(l != NULL) { if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0) return l->value; l = l->next; while (l!=NULL) { if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0) return l->value; l = l->next; } } return -1; Cycles: 7.255.927.875 (-0,495%) Instructions: 1.067.896.719 (+0,444%) Branch mispredictions: 11.464.124 (+0,603%) Time elapsed: 2,1613 s (-5,731%)
  • 6. Inline inline struct block slurp(char *filename) inline unsigned long hash(char *addr, size_t len) inline void insert(char *keyaddr, size_t keylen, int value) inline int lookup(char *keyaddr, size_t keylen) Cycles: 7.265.216.080 (+0,128%) Instructions: 1.067.543.945 (-0,033%) Branch mispredictions: 11.541.050 (+0,671%) Time elapsed: 2,1672 s (+0,273%)
  • 7. Replace loop with macro #define REPEAT10(x) { x x x x x x x x x x } REPEAT10 ( for (p=input2.addr, endp=input2.addr+input2.len; p<endp; ) { ... } ); Cycles: 7.313.103.515 (+0,659%) Instructions: 1.062.596.883 (-0,463%) Branch mispredictions: 11.423.373 (-1,020%) Time elapsed: 2,1791 s (+0,549%)
  • 8. Some Minor Changes ● new macro for HASHSIZE-1 ● Remove unnecessary Casts ... with no effects
  • 9. Loop peeling + adjust len inline unsigned long hash(char *addr, size_t len) { ... if(len > 7 ) { len = len - 7; x = (*(unsigned long *)addr)*hashmult; for (i=8; i<len; i+=8) { w = *(unsigned long *)(addr+i); x = (x + w)*hashmult; } len = len + 7; } ...
  • 10. Loop peeling + adjust len Cycles: 8.271.902.713 (+13,111%) Instructions: 1.038.690.398 (-2,250%) Branch mispredictions: 11.809.722 (+3,382%) Time elapsed: 2,4551 s (+12,668%) => probably faster for long strings => changes discarded
  • 11. Pointers instead of indices uint128_t x; unsigned long * laddr = (unsigned long *) addr; unsigned long * end = (unsigned long *) (addr+len); if(len > 7 ) { x = *laddr * hashmult; end--; for (laddr++; laddr <= end; laddr++) { x = (x + *laddr)*hashmult; } if (laddr < (end+1)) x = ( x + ((*laddr)<< ( ((char*)laddr - (char*)end)*8)) ) * hashmult; return x+(x>>64); } else if (laddr < end) { x = (uint128_t)((*laddr)<<((8-len)*8)) * hashmult; return x+(x>>64); } return 0;
  • 12. Pointers instead of indices Cycles: 8.253.559.129 (+12,860%) Instructions: 1.021.822.315 (-3,837%) Branch mispredictions: 11.825.252 (+3,518%) Time elapsed: 2,4558 s (+12,700%) => probably faster for long strings => changes discarded
  • 13. Improve loop-layout for (p=input1.addr, endp=input1.addr+input1.len, i=0; p<endp; i++) { nextp=memchr(p, '\n', endp-p); if (nextp == NULL) break; ... } ------------------------------------------------ for (p=input.addr, endp=input.addr+input.len, r=0, nextp=memchr(p, '\n', endp-p); nextp != NULL; r++, nextp=memchr(p, '\n', endp-p)) { ... }
  • 14. Improve loop-layout Cycles: 7.364.723.755 (+0,705%) Instructions: 1072512560 (+0,933%) Branch mispredictions: 11606354 (+1,601%) Time elapsed: 2,2509 s (+3,294%) => "if" and "&&" probably similar instructions in this case
  • 15. Remove unnecessary check for (p=input.addr, endp=input.addr+input.len, r=0, nextp=memchr(p, '\n', endp-p); nextp != NULL; r++, nextp=memchr(p, '\n', endp-p)) { ... } Remove unnecessary variables struct block input1, input2; struct block input; unsigned int i; unsigned long r=0; unsigned long r=0;
  • 16. Remove unnecessary check & variables Cycles: 7323904385 (-0,554%) Instructions: 1064977111 (-0,702%) Branch mispredictions: 11734428 (+1,103%) Time elapsed: 2,2129 s (-1,688%)
  • 17. Sentinel with rawmemchr ● Idea: ○ replace '\0' with '\n' at the end ○ use rawmemchr without length check instead ○ safe compares endp=input1.addr+input1.len; *endp = '\n'; for (p=input1.addr, i=0, nextp=rawmemchr(p, '\n'); p<endp ; i++) { nextp=rawmemchr(p, '\n'); insert(p, nextp-p, i); p = nextp+1; }
  • 18. Sentinel self made rawmemchr endp=input.addr+input.len; *endp = '\n'; p=input.addr; nextp = p; for (r=0; nextp<endp; r++) { for(;*nextp ^ '\n'; nextp++); insert(p, nextp-p, r); nextp++; p = nextp; } Cycles: 7.400.275.087 (+1,042%) Instructions: 1.157.591.866 (+8,696%) Branch mispredictions: 11.715.914 (-0,158%) Time elapsed: 2,2064 s (-0,293%)
  • 19. Faster memcmp inline int mycmp(char* in1, char* in2, int len){ do{ if(*in1 ^ *in2) return 0; in1++; in2++; len--; }while(len>0); return 1; } if (keylen == l->keylen && mycmp(l->keyaddr, keyaddr, keylen)) Cycles: 5.826.523.410 (-21,266%) Instructions: 1.913.851.749 (+65,330%) Branch mispredictions: 14.810.147 (+26,410%) Time elapsed: 1,7366 s (-21,292%)
  • 20. Faster memcmp with Sentinel *(keyaddr+keylen) = 0; // FROM INSERT inline int mycmp(char* in1, char* in2, int len){ while(*in1 == *in2) { in1++; in2++; len--; } return len; } if (keylen==l->keylen && !mycmp(l->keyaddr, keyaddr, keylen)) // FROM LOOKUP Cycles: 5.766.254.891 (-22,080%) Instructions: 1.747.135.165 (+50,928%) Branch mispredictions: 14.772.984 (+26,093%) Time elapsed: 1,7182 s (-22,126%)
  • 21. Caching int *cache = malloc(size*sizeof(int)); endcache = cache; int *startcache, *endcache; startcache = cache; REPEAT9 ( endcache = startcache + size; cache = startcache; while (cache < endcache) { while(nextp<endp) { r = r * 2654435761L + if (cache >= endcache){ *cache; size = size<<1; r = r + (r>>32); cache = realloc(cache, size*sizeof(int)); cache++; } } ); for(;*nextp ^ '\n'; nextp++); *cache = lookup(p, nextp-p); r = r * 2654435761L + *cache; r = r + (r>>32); cache++; nextp++; p = nextp; }
  • 22. Caching + memcmp Cycles: 925.886.063 (-84,109%) Instructions: 494.630.615 (-74,155%) Branch mispredictions: 2.395.446 (-83,825%) Time elapsed: 0,2847 s (-83,603%)
  • 23. Caching + memcmp with Sentinel Cycles: 925.783.880 (-84,110%) Instructions: 475.125.520 (-75,172%) Branch mispredictions: 2.418.936 (-83,659%) Time elapsed: 0,2839 s (-83,738%)
  • 24. Approximation of cache size int size = input.len/6; int size = input.len/2; ... if (cache >= endcache){ size = size<<1; cache = realloc(cache, size*sizeof(int)); } ... Cycles: 930.929.061 (+0,544%) Instructions: 475.676.977 (-3,831%) Branch mispredictions: 2.384.999 (-0,436%) Time elapsed: 0,2830 s (-0,586%)
  • 25. Overall Cycles: 930.929.061 (-87,233%) Instructions: 475.676.977 (-55,259%) Branch mispredictions: 2.384.999 (-79,070%) Time elapsed: 0,2830 s (-87,656%)