psync2.tcl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  # Print the replication state of instances 0..count-1 to stdout.
  #
  # arr_name - name of an array in the CALLER's scope mapping an instance
  #            index to its client handle (linked here via upvar).
  # count    - number of instances to dump.
  #
  # For every instance this prints the number of full syncs, both
  # replication IDs with their offsets, and the backlog boundaries.
  # It is only used for debugging output and returns nothing of interest.
  proc psync2_dump_repl_state {arr_name count} {
      upvar 1 $arr_name R
      for {set j 0} {$j < $count} {incr j} {
          puts "$j: sync_full: [status $R($j) sync_full]"
          puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
          puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
          puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
          puts "---"
      }
  }

  # PSYNC2 stress test.
  #
  # Five servers are started. For "duration" seconds (and at most 50 cycles)
  # the test repeatedly:
  #   1) promotes a random instance to master,
  #   2) attaches every other instance to a random, already placed instance,
  #   3) checks that a counter increment reaches all the instances,
  #   4) writes to the master while killing random replication links,
  #   5) checks that all the instances converge again,
  # and finally asserts that the total number of full synchronizations
  # across the five instances is exactly 4.
  # After the cycles, two more tests restart a replica and verify that it
  # reconnects without causing an additional full synchronization.
  start_server {tags {"psync2"}} {
  start_server {} {
  start_server {} {
  start_server {} {
  start_server {} {
      set master_id 0                 ; # Current master
      set start_time [clock seconds]  ; # Test start time
      set counter_value 0             ; # Current value of the Redis counter "x"

      # Config
      set debug_msg 0                 ; # Enable additional debug messages

      set no_exit 0                   ; # Do not exit at end of the test

      set duration 20                 ; # Total test seconds

      set genload 1                   ; # Load master with writes at every cycle

      set genload_time 5000           ; # Writes duration time in ms

      set disconnect 1                ; # Break replication link between random
                                        # master and slave instances while the
                                        # master is loaded with writes.

      set disconnect_period 1000      ; # Disconnect repl link every N ms.

      # Cache the client handle, host and port of each of the five servers.
      # Index 0 is the innermost start_server, index 4 the outermost one.
      for {set j 0} {$j < 5} {incr j} {
          set R($j) [srv [expr 0-$j] client]
          set R_host($j) [srv [expr 0-$j] host]
          set R_port($j) [srv [expr 0-$j] port]
          if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
      }

      set cycle 1
      while {([clock seconds]-$start_time) < $duration} {
          # Empty test used only to mark the start of a cycle in the output.
          test "PSYNC2: --- CYCLE $cycle ---" {}
          incr cycle

          # Create a random replication layout.
          # Start with switching master (this simulates a failover).

          # 1) Select the new master.
          set master_id [randomInt 5]
          set used [list $master_id]
          test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
              $R($master_id) slaveof no one
              # Create the counter key only the very first time.
              if {$counter_value == 0} {
                  $R($master_id) set x $counter_value
              }
          }

          # 2) Attach all the slaves to a random instance
          while {[llength $used] != 5} {
              # Pick an instance that has no role in this layout yet.
              while 1 {
                  set slave_id [randomInt 5]
                  if {[lsearch -exact $used $slave_id] == -1} break
              }
              # Its master is any instance that is already part of the layout,
              # so chains of replicas are possible.
              set rand [randomInt [llength $used]]
              set mid [lindex $used $rand]
              set master_host $R_host($mid)
              set master_port $R_port($mid)

              test "PSYNC2: Set #$slave_id to replicate from #$mid" {
                  $R($slave_id) slaveof $master_host $master_port
              }
              lappend used $slave_id
          }

          # 3) Increment the counter and wait for all the instances
          # to converge.
          test "PSYNC2: cluster is consistent after failover" {
              $R($master_id) incr x; incr counter_value
              for {set j 0} {$j < 5} {incr j} {
                  wait_for_condition 50 1000 {
                      [$R($j) get x] == $counter_value
                  } else {
                      fail "Instance #$j x variable is inconsistent"
                  }
              }
          }

          # 4) Generate load while breaking the connection of random
          # slave-master pairs.
          test "PSYNC2: generate load while killing replication links" {
              set t [clock milliseconds]
              set next_break [expr {$t+$disconnect_period}]
              while {[clock milliseconds]-$t < $genload_time} {
                  if {$genload} {
                      $R($master_id) incr x; incr counter_value
                  }
                  # Use >= and not ==: if an iteration happens to span the
                  # exact deadline millisecond, an equality check would never
                  # match again and no more links would be broken during
                  # this cycle.
                  if {[clock milliseconds] >= $next_break} {
                      set next_break \
                          [expr {[clock milliseconds]+$disconnect_period}]
                      set slave_id [randomInt 5]
                      if {$disconnect} {
                          $R($slave_id) client kill type master
                          if {$debug_msg} {
                              puts "+++ Breaking link for replica #$slave_id"
                          }
                      }
                  }
              }
          }

          # 5) Wait for all the instances to converge on the counter value
          # reached while the master was under load. The value read here is
          # only used in the test name.
          set x [$R($master_id) get x]
          test "PSYNC2: cluster is consistent after load (x = $x)" {
              for {set j 0} {$j < 5} {incr j} {
                  wait_for_condition 50 1000 {
                      [$R($j) get x] == $counter_value
                  } else {
                      fail "Instance #$j x variable is inconsistent"
                  }
              }
          }

          # wait for all the slaves to be in sync with the master
          set master_ofs [status $R($master_id) master_repl_offset]
          wait_for_condition 500 100 {
              $master_ofs == [status $R(0) master_repl_offset] &&
              $master_ofs == [status $R(1) master_repl_offset] &&
              $master_ofs == [status $R(2) master_repl_offset] &&
              $master_ofs == [status $R(3) master_repl_offset] &&
              $master_ofs == [status $R(4) master_repl_offset]
          } else {
              if {$debug_msg} {
                  psync2_dump_repl_state R 5
              }
              fail "Slaves are not in sync with the master after too long time."
          }

          # Put down the old master so that it cannot generate more
          # replication stream, this way in the next master switch, the time at
          # which we move slaves away is not important, each will have full
          # history (otherwise PINGs will make certain slaves have more history),
          # and sometimes a full resync will be needed.
          $R($master_id) slaveof 127.0.0.1 0 ;# We use port zero to make it fail.

          if {$debug_msg} {
              psync2_dump_repl_state R 5
          }

          test "PSYNC2: total sum of full synchronizations is exactly 4" {
              set sum 0
              for {set j 0} {$j < 5} {incr j} {
                  incr sum [status $R($j) sync_full]
              }
              assert {$sum == 4}
          }

          # Limit anyway the maximum number of cycles. This is useful when the
          # test is skipped via --only option of the test suite. In that case
          # we don't want to see many seconds of this test being just skipped.
          if {$cycle > 50} break
      }

      test "PSYNC2: Bring the master back again for next test" {
          $R($master_id) slaveof no one
          set master_host $R_host($master_id)
          set master_port $R_port($master_id)
          # Flat layout: every other instance replicates directly from the
          # master.
          for {set j 0} {$j < 5} {incr j} {
              if {$j == $master_id} continue
              $R($j) slaveof $master_host $master_port
          }

          # Wait for slaves to sync
          wait_for_condition 50 1000 {
              [status $R($master_id) connected_slaves] == 4
          } else {
              fail "Replica not reconnecting"
          }
      }

      test "PSYNC2: Partial resync after restart using RDB aux fields" {
          # Pick the instance right after the master (deterministic choice).
          set slave_id [expr {($master_id+1)%5}]
          set sync_count [status $R($master_id) sync_full]
          set sync_partial [status $R($master_id) sync_partial_ok]
          # Errors are ignored on purpose here.
          # NOTE(review): presumably the restart drops the client connection
          # and makes the call fail -- confirm.
          catch {
              $R($slave_id) config rewrite
              $R($slave_id) debug restart
          }
          # note: just waiting for connected_slaves==4 has a race condition since
          # we might do the check before the master realized that the slave disconnected
          wait_for_condition 50 1000 {
              [status $R($master_id) sync_partial_ok] == $sync_partial + 1
          } else {
              fail "Replica not reconnecting"
          }
          # The reconnection must not have caused a full synchronization.
          set new_sync_count [status $R($master_id) sync_full]
          assert {$sync_count == $new_sync_count}
      }

      test "PSYNC2: Replica RDB restart with EVALSHA in backlog issue #4483" {
          # Pick the instance right after the master (deterministic choice).
          set slave_id [expr {($master_id+1)%5}]
          set sync_count [status $R($master_id) sync_full]

          # Make sure to replicate the first EVAL while the slave is online
          # so that it's part of the scripts the master believes it's safe
          # to propagate as EVALSHA.
          # NOTE(review): the digest below is presumably the SHA1 of the
          # script passed to EVAL -- confirm before changing the script.
          $R($master_id) EVAL {return redis.call("incr","__mycounter")} 0
          $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0

          # Wait for the two to sync
          wait_for_condition 50 1000 {
              [$R($master_id) debug digest] == [$R($slave_id) debug digest]
          } else {
              fail "Replica not reconnecting"
          }

          # Prevent the slave from receiving master updates, and at
          # the same time send a new script several times to the
          # master, so that we'll end with EVALSHA into the backlog.
          $R($slave_id) slaveof 127.0.0.1 0

          $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
          $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
          $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0

          # Errors are ignored on purpose, as for the previous restart.
          catch {
              $R($slave_id) config rewrite
              $R($slave_id) debug restart
          }

          # Reconfigure the slave correctly again, when it's back online.
          # Up to 50 attempts, one second apart. If they all fail, the
          # connected_slaves check below is the one reporting the failure.
          set retry 50
          while {$retry} {
              if {[catch {
                  $R($slave_id) slaveof $master_host $master_port
              }]} {
                  after 1000
              } else {
                  break
              }
              incr retry -1
          }

          # The master should be back at 4 slaves eventually
          wait_for_condition 50 1000 {
              [status $R($master_id) connected_slaves] == 4
          } else {
              fail "Replica not reconnecting"
          }
          # Again, no additional full synchronization is allowed.
          set new_sync_count [status $R($master_id) sync_full]
          assert {$sync_count == $new_sync_count}

          # However if the slave started with the full state of the
          # scripting engine, we should now have the same digest.
          wait_for_condition 50 1000 {
              [$R($master_id) debug digest] == [$R($slave_id) debug digest]
          } else {
              fail "Debug digest mismatch between master and replica in post-restart handshake"
          }
      }

      # Debugging aid: keep the servers alive forever, printing a dot every
      # second, so that they can be inspected manually.
      if {$no_exit} {
          while 1 { puts -nonewline .; flush stdout; after 1000}
      }

  }}}}}