# psync2.tcl — PSYNC2 (replication protocol v2) integration test.
  1. start_server {tags {"psync2"}} {
  2. start_server {} {
  3. start_server {} {
  4. start_server {} {
  5. start_server {} {
  6. set master_id 0 ; # Current master
  7. set start_time [clock seconds] ; # Test start time
  8. set counter_value 0 ; # Current value of the Redis counter "x"
  9. # Config
  10. set debug_msg 0 ; # Enable additional debug messages
  11. set no_exit 0 ; # Do not exit at end of the test
  12. set duration 20 ; # Total test seconds
  13. set genload 1 ; # Load master with writes at every cycle
  14. set genload_time 5000 ; # Writes duration time in ms
  15. set disconnect 1 ; # Break replication link between random
  16. # master and slave instances while the
  17. # master is loaded with writes.
  18. set disconnect_period 1000 ; # Disconnect repl link every N ms.
  19. for {set j 0} {$j < 5} {incr j} {
  20. set R($j) [srv [expr 0-$j] client]
  21. set R_host($j) [srv [expr 0-$j] host]
  22. set R_port($j) [srv [expr 0-$j] port]
  23. if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
  24. }
  25. set cycle 1
  26. while {([clock seconds]-$start_time) < $duration} {
  27. test "PSYNC2: --- CYCLE $cycle ---" {
  28. incr cycle
  29. }
  30. # Create a random replication layout.
  31. # Start with switching master (this simulates a failover).
  32. # 1) Select the new master.
  33. set master_id [randomInt 5]
  34. set used [list $master_id]
  35. test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
  36. $R($master_id) slaveof no one
  37. if {$counter_value == 0} {
  38. $R($master_id) set x $counter_value
  39. }
  40. }
  41. # 2) Attach all the slaves to a random instance
  42. while {[llength $used] != 5} {
  43. while 1 {
  44. set slave_id [randomInt 5]
  45. if {[lsearch -exact $used $slave_id] == -1} break
  46. }
  47. set rand [randomInt [llength $used]]
  48. set mid [lindex $used $rand]
  49. set master_host $R_host($mid)
  50. set master_port $R_port($mid)
  51. test "PSYNC2: Set #$slave_id to replicate from #$mid" {
  52. $R($slave_id) slaveof $master_host $master_port
  53. }
  54. lappend used $slave_id
  55. }
  56. # 3) Increment the counter and wait for all the instances
  57. # to converge.
  58. test "PSYNC2: cluster is consistent after failover" {
  59. $R($master_id) incr x; incr counter_value
  60. for {set j 0} {$j < 5} {incr j} {
  61. wait_for_condition 50 1000 {
  62. [$R($j) get x] == $counter_value
  63. } else {
  64. fail "Instance #$j x variable is inconsistent"
  65. }
  66. }
  67. }
  68. # 4) Generate load while breaking the connection of random
  69. # slave-master pairs.
  70. test "PSYNC2: generate load while killing replication links" {
  71. set t [clock milliseconds]
  72. set next_break [expr {$t+$disconnect_period}]
  73. while {[clock milliseconds]-$t < $genload_time} {
  74. if {$genload} {
  75. $R($master_id) incr x; incr counter_value
  76. }
  77. if {[clock milliseconds] == $next_break} {
  78. set next_break \
  79. [expr {[clock milliseconds]+$disconnect_period}]
  80. set slave_id [randomInt 5]
  81. if {$disconnect} {
  82. $R($slave_id) client kill type master
  83. if {$debug_msg} {
  84. puts "+++ Breaking link for slave #$slave_id"
  85. }
  86. }
  87. }
  88. }
  89. }
  90. # 5) Increment the counter and wait for all the instances
  91. set x [$R($master_id) get x]
  92. test "PSYNC2: cluster is consistent after load (x = $x)" {
  93. for {set j 0} {$j < 5} {incr j} {
  94. wait_for_condition 50 1000 {
  95. [$R($j) get x] == $counter_value
  96. } else {
  97. fail "Instance #$j x variable is inconsistent"
  98. }
  99. }
  100. }
  101. # Put down the old master so that it cannot generate more
  102. # replication stream, this way in the next master switch, the time at
  103. # which we move slaves away is not important, each will have full
  104. # history (otherwise PINGs will make certain slaves have more history),
  105. # and sometimes a full resync will be needed.
  106. $R($master_id) slaveof 127.0.0.1 0 ;# We use port zero to make it fail.
  107. if {$debug_msg} {
  108. for {set j 0} {$j < 5} {incr j} {
  109. puts "$j: sync_full: [status $R($j) sync_full]"
  110. puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
  111. puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
  112. puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
  113. puts "---"
  114. }
  115. }
  116. test "PSYNC2: total sum of full synchronizations is exactly 4" {
  117. set sum 0
  118. for {set j 0} {$j < 5} {incr j} {
  119. incr sum [status $R($j) sync_full]
  120. }
  121. assert {$sum == 4}
  122. }
  123. }
  124. test "PSYNC2: Bring the master back again for next test" {
  125. $R($master_id) slaveof no one
  126. set master_host $R_host($master_id)
  127. set master_port $R_port($master_id)
  128. for {set j 0} {$j < 5} {incr j} {
  129. if {$j == $master_id} continue
  130. $R($j) slaveof $master_host $master_port
  131. }
  132. # Wait for slaves to sync
  133. wait_for_condition 50 1000 {
  134. [status $R($master_id) connected_slaves] == 4
  135. } else {
  136. fail "Slave not reconnecting"
  137. }
  138. }
  139. test "PSYNC2: Partial resync after restart using RDB aux fields" {
  140. # Pick a random slave
  141. set slave_id [expr {($master_id+1)%5}]
  142. set sync_count [status $R($master_id) sync_full]
  143. catch {
  144. $R($slave_id) config rewrite
  145. $R($slave_id) debug restart
  146. }
  147. wait_for_condition 50 1000 {
  148. [status $R($master_id) connected_slaves] == 4
  149. } else {
  150. fail "Slave not reconnecting"
  151. }
  152. set new_sync_count [status $R($master_id) sync_full]
  153. assert {$sync_count == $new_sync_count}
  154. }
  155. test "PSYNC2: Slave RDB restart with EVALSHA in backlog issue #4483" {
  156. # Pick a random slave
  157. set slave_id [expr {($master_id+1)%5}]
  158. set sync_count [status $R($master_id) sync_full]
  159. # Make sure to replicate the first EVAL while the salve is online
  160. # so that it's part of the scripts the master believes it's safe
  161. # to propagate as EVALSHA.
  162. $R($master_id) EVAL {return redis.call("incr","__mycounter")} 0
  163. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  164. # Wait for the two to sync
  165. wait_for_condition 50 1000 {
  166. [$R($master_id) debug digest] == [$R($slave_id) debug digest]
  167. } else {
  168. fail "Slave not reconnecting"
  169. }
  170. # Prevent the slave from receiving master updates, and at
  171. # the same time send a new script several times to the
  172. # master, so that we'll end with EVALSHA into the backlog.
  173. $R($slave_id) slaveof 127.0.0.1 0
  174. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  175. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  176. $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
  177. catch {
  178. $R($slave_id) config rewrite
  179. $R($slave_id) debug restart
  180. }
  181. # Reconfigure the slave correctly again, when it's back online.
  182. set retry 50
  183. while {$retry} {
  184. if {[catch {
  185. $R($slave_id) slaveof $master_host $master_port
  186. }]} {
  187. after 1000
  188. } else {
  189. break
  190. }
  191. incr retry -1
  192. }
  193. # The master should be back at 4 slaves eventually
  194. wait_for_condition 50 1000 {
  195. [status $R($master_id) connected_slaves] == 4
  196. } else {
  197. fail "Slave not reconnecting"
  198. }
  199. set new_sync_count [status $R($master_id) sync_full]
  200. assert {$sync_count == $new_sync_count}
  201. # However if the slave started with the full state of the
  202. # scripting engine, we should now have the same digest.
  203. wait_for_condition 50 1000 {
  204. [$R($master_id) debug digest] == [$R($slave_id) debug digest]
  205. } else {
  206. fail "Debug digest mismatch between master and slave in post-restart handshake"
  207. }
  208. }
  209. if {$no_exit} {
  210. while 1 { puts -nonewline .; flush stdout; after 1000}
  211. }
  212. }}}}}