# psync2.tcl — PSYNC2 replication test suite (extraction artifacts removed).
  1. start_server {tags {"psync2"}} {
  2. start_server {} {
  3. start_server {} {
  4. start_server {} {
  5. start_server {} {
  6. set master_id 0 ; # Current master
  7. set start_time [clock seconds] ; # Test start time
  8. set counter_value 0 ; # Current value of the Redis counter "x"
  9. # Config
  10. set debug_msg 0 ; # Enable additional debug messages
  11. set no_exit 0 ; # Do not exit at end of the test
  12. set duration 20 ; # Total test seconds
  13. set genload 1 ; # Load master with writes at every cycle
  14. set genload_time 5000 ; # Writes duration time in ms
  15. set disconnect 1 ; # Break replication link between random
  16. # master and slave instances while the
  17. # master is loaded with writes.
  18. set disconnect_period 1000 ; # Disconnect repl link every N ms.
  19. for {set j 0} {$j < 5} {incr j} {
  20. set R($j) [srv [expr 0-$j] client]
  21. set R_host($j) [srv [expr 0-$j] host]
  22. set R_port($j) [srv [expr 0-$j] port]
  23. if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
  24. }
  25. set cycle 1
  26. while {([clock seconds]-$start_time) < $duration} {
  27. test "PSYNC2: --- CYCLE $cycle ---" {}
  28. incr cycle
  29. # Create a random replication layout.
  30. # Start with switching master (this simulates a failover).
  31. # 1) Select the new master.
  32. set master_id [randomInt 5]
  33. set used [list $master_id]
  34. test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
  35. $R($master_id) slaveof no one
  36. if {$counter_value == 0} {
  37. $R($master_id) set x $counter_value
  38. }
  39. }
  40. # 2) Attach all the slaves to a random instance
  41. while {[llength $used] != 5} {
  42. while 1 {
  43. set slave_id [randomInt 5]
  44. if {[lsearch -exact $used $slave_id] == -1} break
  45. }
  46. set rand [randomInt [llength $used]]
  47. set mid [lindex $used $rand]
  48. set master_host $R_host($mid)
  49. set master_port $R_port($mid)
  50. test "PSYNC2: Set #$slave_id to replicate from #$mid" {
  51. $R($slave_id) slaveof $master_host $master_port
  52. }
  53. lappend used $slave_id
  54. }
  55. # 3) Increment the counter and wait for all the instances
  56. # to converge.
  57. test "PSYNC2: cluster is consistent after failover" {
  58. $R($master_id) incr x; incr counter_value
  59. for {set j 0} {$j < 5} {incr j} {
  60. wait_for_condition 50 1000 {
  61. [$R($j) get x] == $counter_value
  62. } else {
  63. fail "Instance #$j x variable is inconsistent"
  64. }
  65. }
  66. }
  67. # 4) Generate load while breaking the connection of random
  68. # slave-master pairs.
  69. test "PSYNC2: generate load while killing replication links" {
  70. set t [clock milliseconds]
  71. set next_break [expr {$t+$disconnect_period}]
  72. while {[clock milliseconds]-$t < $genload_time} {
  73. if {$genload} {
  74. $R($master_id) incr x; incr counter_value
  75. }
  76. if {[clock milliseconds] == $next_break} {
  77. set next_break \
  78. [expr {[clock milliseconds]+$disconnect_period}]
  79. set slave_id [randomInt 5]
  80. if {$disconnect} {
  81. $R($slave_id) client kill type master
  82. if {$debug_msg} {
  83. puts "+++ Breaking link for replica #$slave_id"
  84. }
  85. }
  86. }
  87. }
  88. }
  89. # 5) Increment the counter and wait for all the instances
  90. set x [$R($master_id) get x]
  91. test "PSYNC2: cluster is consistent after load (x = $x)" {
  92. for {set j 0} {$j < 5} {incr j} {
  93. wait_for_condition 50 1000 {
  94. [$R($j) get x] == $counter_value
  95. } else {
  96. fail "Instance #$j x variable is inconsistent"
  97. }
  98. }
  99. }
  100. # Put down the old master so that it cannot generate more
  101. # replication stream, this way in the next master switch, the time at
  102. # which we move slaves away is not important, each will have full
  103. # history (otherwise PINGs will make certain slaves have more history),
  104. # and sometimes a full resync will be needed.
  105. $R($master_id) slaveof 127.0.0.1 0 ;# We use port zero to make it fail.
  106. if {$debug_msg} {
  107. for {set j 0} {$j < 5} {incr j} {
  108. puts "$j: sync_full: [status $R($j) sync_full]"
  109. puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
  110. puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
  111. puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
  112. puts "---"
  113. }
  114. }
  115. test "PSYNC2: total sum of full synchronizations is exactly 4" {
  116. set sum 0
  117. for {set j 0} {$j < 5} {incr j} {
  118. incr sum [status $R($j) sync_full]
  119. }
  120. assert {$sum == 4}
  121. }
  122. # Limit anyway the maximum number of cycles. This is useful when the
  123. # test is skipped via --only option of the test suite. In that case
  124. # we don't want to see many seconds of this test being just skipped.
  125. if {$cycle > 50} break
  126. }
  127. test "PSYNC2: Bring the master back again for next test" {
  128. $R($master_id) slaveof no one
  129. set master_host $R_host($master_id)
  130. set master_port $R_port($master_id)
  131. for {set j 0} {$j < 5} {incr j} {
  132. if {$j == $master_id} continue
  133. $R($j) slaveof $master_host $master_port
  134. }
  135. # Wait for slaves to sync
  136. wait_for_condition 50 1000 {
  137. [status $R($master_id) connected_slaves] == 4
  138. } else {
  139. fail "Replica not reconnecting"
  140. }
  141. }
  142. test "PSYNC2: Partial resync after restart using RDB aux fields" {
  143. # Pick a random slave
  144. set slave_id [expr {($master_id+1)%5}]
  145. set sync_count [status $R($master_id) sync_full]
  146. set sync_partial [status $R($master_id) sync_partial_ok]
  147. catch {
  148. $R($slave_id) config rewrite
  149. $R($slave_id) debug restart
  150. }
  151. # note: just waiting for connected_slaves==4 has a race condition since
  152. # we might do the check before the master realized that the slave disconnected
  153. wait_for_condition 50 1000 {
  154. [status $R($master_id) sync_partial_ok] == $sync_partial + 1
  155. } else {
  156. fail "Replica not reconnecting"
  157. }
  158. set new_sync_count [status $R($master_id) sync_full]
  159. assert {$sync_count == $new_sync_count}
  160. }
  161. test "PSYNC2: Replica RDB restart with EVALSHA in backlog issue #4483" {
  162. # Pick a random slave
  163. set slave_id [expr {($master_id+1)%5}]
  164. set sync_count [status $R($master_id) sync_full]
  165. # Make sure to replicate the first EVAL while the salve is online
  166. # so that it's part of the scripts the master believes it's safe
  167. # to propagate as EVALSHA.
  168. $R($master_id) EVAL {return redis.call("incr","__mycounter")} 0
  169. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  170. # Wait for the two to sync
  171. wait_for_condition 50 1000 {
  172. [$R($master_id) debug digest] == [$R($slave_id) debug digest]
  173. } else {
  174. fail "Replica not reconnecting"
  175. }
  176. # Prevent the slave from receiving master updates, and at
  177. # the same time send a new script several times to the
  178. # master, so that we'll end with EVALSHA into the backlog.
  179. $R($slave_id) slaveof 127.0.0.1 0
  180. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  181. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  182. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  183. catch {
  184. $R($slave_id) config rewrite
  185. $R($slave_id) debug restart
  186. }
  187. # Reconfigure the slave correctly again, when it's back online.
  188. set retry 50
  189. while {$retry} {
  190. if {[catch {
  191. $R($slave_id) slaveof $master_host $master_port
  192. }]} {
  193. after 1000
  194. } else {
  195. break
  196. }
  197. incr retry -1
  198. }
  199. # The master should be back at 4 slaves eventually
  200. wait_for_condition 50 1000 {
  201. [status $R($master_id) connected_slaves] == 4
  202. } else {
  203. fail "Replica not reconnecting"
  204. }
  205. set new_sync_count [status $R($master_id) sync_full]
  206. assert {$sync_count == $new_sync_count}
  207. # However if the slave started with the full state of the
  208. # scripting engine, we should now have the same digest.
  209. wait_for_condition 50 1000 {
  210. [$R($master_id) debug digest] == [$R($slave_id) debug digest]
  211. } else {
  212. fail "Debug digest mismatch between master and replica in post-restart handshake"
  213. }
  214. }
  215. if {$no_exit} {
  216. while 1 { puts -nonewline .; flush stdout; after 1000}
  217. }
  218. }}}}}