2. Introduction
• Queues are everywhere in parallel applications
and operating systems
• Many researchers have proposed queues
– Hwang and Briggs
– Gottlieb
– Massalin
– Et al. etc…
• Queue performance can be critical to operating
system performance
– Scheduling Queues
– Free memory lists
– Many other critical kernel operations
3. Concurrent FIFO Queue algorithms
• Blocking algorithms risk performance
degradation
– A process can be delayed or halted at inopportune
moments
• Scheduling preemption
• Page faults
• Cache misses
– Slow processes can prevent faster ones from
completing indefinitely
• Non-Blocking algorithms must solve the ABA
problem
– During contention, some process will complete
within a given number of operations
4. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
loop
value = SM
newVal = value +1
Stack Data = d
CAS(&SM, value, newVal)
break
Stack
SM
5
…
x
time
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
5. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
loop
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
CAS(&SM,value,newVal)
value = SM
newVal = value +1
Stack Data = d
CAS(&SM, value, newVal)
break
Stack
SM
4
…
x
time
6. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
loop
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
CAS(&SM, value, newVal)
break
Stack
SM
4
…
x
time
7. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
loop
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Push(z)
Stack Data = d
CAS(&SM, value, newVal)
break
Stack
SM
4
…
x
time
8. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
loop
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
CAS(&SM, value, newVal)
value = 4
break
Stack
SM
4
…
x
time
9. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
loop
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
CAS(&SM, value, newVal)
value = 4
break
newVal=5
Stack
SM
4
…
x
time
10. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
loop
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
CAS(&SM, value, newVal)
value = 4
break
newVal=5
Stack
CAS(&SM,value,newVal)
SM
5
…
z
time
11. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
loop
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
CAS(&SM, value, newVal)
value = 4
break
newVal=5
Stack
CAS(&SM,value,newVal)
CAS(&SM,value,newVal)
SM
5
…
z
time
12. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
loop
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
CAS(&SM, value, newVal)
value = 4
break
newVal=5
Stack
CAS(&SM,value,newVal)
CAS(&SM,value,newVal)
v1=x
SM
4
…
z
time
13. ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
CAS(&SM, value, newVal)
v1=Pop()
value = 5
Push (d) {
newVal = 4
data=X
return data
value = 5
newVal = 4
break
v2=Pop()
data=X
loop
CAS(&SM,value,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
CAS(&SM, value, newVal)
value = 4
break
newVal=5
Stack
CAS(&SM,value,newVal)
CAS(&SM,value,newVal)
v1=x
SM
4
…
z
CAS should fail but it succeeds
time
Thread1 has Thread2’s data
14. Solutions for ABA problem
Cache Kernel
• Add version # to data structures
• Increment # during every CAS instruction
LL/SC
• Fail if Cache Line has been written to
15. Solution for ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
DCAS(&SM, value,
v1=Pop()
value = 5
return data
Push (d) {
newVal = 4
data=X
break
value = 5
newVal = 4
<ver++,newVal>)
v2=Pop()
data=X
loop
DCAS(&SM,value,ver,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
DCAS(&SM, value,
value = 4
<ver++,newVal>)
newVal=5
Stack
break
DCAS(&SM,value,ver,newVal)
DCAS(&SM,value,ver,newVal)
Will not incorrectly succeed
SM
5
…
z
(ver != ver+2)
time
16. Solution for ABA problem
Pop () {
loop
value = SM
newVal = value -1
THREAD1
THREAD2
data = Stack Data
DCAS(&SM, value,
v1=Pop()
value = 5
return data
Push (d) {
newVal = 4
data=Z
break
value = 5
newVal = 4
<ver++,newVal>)
v2=Pop()
data=X
loop
DCAS(&SM,value,ver,newVal)
value = SM
v2 = x
newVal = value +1
Stack Data = d
Push(z)
DCAS(&SM, value,
value = 4
<ver++,newVal>)
newVal=5
Stack
break
DCAS(&SM,value,ver,newVal)
DCAS(&SM,value,ver,newVal)
Will not incorrectly succeed
SM
4
(ver != ver+2)
…
time
V1 = Z
17. Correctness Properties
1. The linked list is always connected
2. Nodes are only inserted after the last node
3. Nodes are only deleted from the beginning
4. Head always points to the first node
5. Tail always points to a node in the list
52. struct node_t {
data_type value
node_t * next
}
struct queue_t {
pointer_t Head
pointer_t Tail
lock_type H_lock
lock_type T_lock
}
initialize(Q: pointer to queue_t)
    node = new_node()
    node->next.ptr = NULL
    Q->Head = Q->Tail = node
    Q->H_lock = Q->T_lock = FREE
dequeue(Q: pointer to queue_t, pvalue: pointer to data_type): boolean
    lock(&Q->H_lock)
        node = Q->Head
        new_head = node->next
        if new_head == NULL
            unlock(&Q->H_lock)
            return FALSE
        endif
        *pvalue = new_head->value
        Q->Head = new_head
    unlock(&Q->H_lock)
    free(node)
    return TRUE
enqueue(Q: pointer to queue_t, value: data_type)
    node = new_node()
    node->value = value
    node->next.ptr = NULL
    lock(&Q->T_lock)
        Q->Tail->next = node
        Q->Tail = node
    unlock(&Q->T_lock)
• The algorithms have the same general structure; only the
data types differ
• No loops, ‘busy waiting’ instead
• Only dequeues access Head Lock
• Only enqueues access Tail Lock
53. Performance Parameters
• Net execution time for one million
enqueue/dequeue pairs
• 12-processor Silicon Graphics Challenge
multiprocessor
• Algorithms compiled using the highest
optimization level
• Including many hand optimizations
55. Conclusion
• NBS clear winner for multiprocessor
multiprogrammed systems
• Above 5 processors, use the new nonblocking queue
• If hardware only supports test-and-set, use
the two-lock queue
• For two or fewer processors, use a single-lock
algorithm for queues