2
0

instances.tcl 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. # Multi-instance test framework.
  2. # This is used in order to test Sentinel and Redis Cluster, and provides
  3. # basic capabilities for spawning and handling N parallel Redis / Sentinel
  4. # instances.
  5. #
  6. # Copyright (C) 2014 Salvatore Sanfilippo antirez@gmail.com
  7. # This software is released under the BSD License. See the COPYING file for
  8. # more information.
  9. package require Tcl 8.5
  10. set tcl_precision 17
  11. source ../support/redis.tcl
  12. source ../support/util.tcl
  13. source ../support/server.tcl
  14. source ../support/test.tcl
  15. set ::verbose 0
  16. set ::valgrind 0
  17. set ::tls 0
  18. set ::pause_on_error 0
  19. set ::simulate_error 0
  20. set ::failed 0
  21. set ::sentinel_instances {}
  22. set ::redis_instances {}
  23. set ::sentinel_base_port 20000
  24. set ::redis_base_port 30000
  25. set ::redis_port_count 1024
  26. set ::pids {} ; # We kill everything at exit
  27. set ::dirs {} ; # We remove all the temp dirs at exit
  28. set ::run_matching {} ; # If non empty, only tests matching pattern are run.
  29. if {[catch {cd tmp}]} {
  30. puts "tmp directory not found."
  31. puts "Please run this test from the Redis source root."
  32. exit 1
  33. }
  34. # Execute the specified instance of the server specified by 'type', using
  35. # the provided configuration file. Returns the PID of the process.
  36. proc exec_instance {type cfgfile} {
  37. if {$type eq "redis"} {
  38. set prgname redis-server
  39. } elseif {$type eq "sentinel"} {
  40. set prgname redis-sentinel
  41. } else {
  42. error "Unknown instance type."
  43. }
  44. if {$::valgrind} {
  45. set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
  46. } else {
  47. set pid [exec ../../../src/${prgname} $cfgfile &]
  48. }
  49. return $pid
  50. }
  51. # Spawn a redis or sentinel instance, depending on 'type'.
  52. proc spawn_instance {type base_port count {conf {}}} {
  53. for {set j 0} {$j < $count} {incr j} {
  54. set port [find_available_port $base_port $::redis_port_count]
  55. incr base_port
  56. puts "Starting $type #$j at port $port"
  57. # Create a directory for this instance.
  58. set dirname "${type}_${j}"
  59. lappend ::dirs $dirname
  60. catch {exec rm -rf $dirname}
  61. file mkdir $dirname
  62. # Write the instance config file.
  63. set cfgfile [file join $dirname $type.conf]
  64. set cfg [open $cfgfile w]
  65. if {$::tls} {
  66. puts $cfg "tls-port $port"
  67. puts $cfg "tls-replication yes"
  68. puts $cfg "tls-cluster yes"
  69. puts $cfg "port 0"
  70. puts $cfg [format "tls-cert-file %s/../../tls/redis.crt" [pwd]]
  71. puts $cfg [format "tls-key-file %s/../../tls/redis.key" [pwd]]
  72. puts $cfg [format "tls-dh-params-file %s/../../tls/redis.dh" [pwd]]
  73. puts $cfg [format "tls-ca-cert-file %s/../../tls/ca.crt" [pwd]]
  74. puts $cfg "loglevel debug"
  75. } else {
  76. puts $cfg "port $port"
  77. }
  78. puts $cfg "dir ./$dirname"
  79. puts $cfg "logfile log.txt"
  80. # Add additional config files
  81. foreach directive $conf {
  82. puts $cfg $directive
  83. }
  84. close $cfg
  85. # Finally exec it and remember the pid for later cleanup.
  86. set pid [exec_instance $type $cfgfile]
  87. lappend ::pids $pid
  88. # Check availability
  89. if {[server_is_up 127.0.0.1 $port 100] == 0} {
  90. abort_sentinel_test "Problems starting $type #$j: ping timeout"
  91. }
  92. # Push the instance into the right list
  93. set link [redis 127.0.0.1 $port 0 $::tls]
  94. $link reconnect 1
  95. lappend ::${type}_instances [list \
  96. pid $pid \
  97. host 127.0.0.1 \
  98. port $port \
  99. link $link \
  100. ]
  101. }
  102. }
  103. proc log_crashes {} {
  104. set start_pattern {*REDIS BUG REPORT START*}
  105. set logs [glob */log.txt]
  106. foreach log $logs {
  107. set fd [open $log]
  108. set found 0
  109. while {[gets $fd line] >= 0} {
  110. if {[string match $start_pattern $line]} {
  111. puts "\n*** Crash report found in $log ***"
  112. set found 1
  113. }
  114. if {$found} {puts $line}
  115. }
  116. }
  117. }
  118. proc cleanup {} {
  119. puts "Cleaning up..."
  120. log_crashes
  121. foreach pid $::pids {
  122. catch {exec kill -9 $pid}
  123. }
  124. foreach dir $::dirs {
  125. catch {exec rm -rf $dir}
  126. }
  127. }
  128. proc abort_sentinel_test msg {
  129. incr ::failed
  130. puts "WARNING: Aborting the test."
  131. puts ">>>>>>>> $msg"
  132. if {$::pause_on_error} pause_on_error
  133. cleanup
  134. exit 1
  135. }
  136. proc parse_options {} {
  137. for {set j 0} {$j < [llength $::argv]} {incr j} {
  138. set opt [lindex $::argv $j]
  139. set val [lindex $::argv [expr $j+1]]
  140. if {$opt eq "--single"} {
  141. incr j
  142. set ::run_matching "*${val}*"
  143. } elseif {$opt eq "--pause-on-error"} {
  144. set ::pause_on_error 1
  145. } elseif {$opt eq "--fail"} {
  146. set ::simulate_error 1
  147. } elseif {$opt eq {--valgrind}} {
  148. set ::valgrind 1
  149. } elseif {$opt eq {--tls}} {
  150. package require tls 1.6
  151. ::tls::init \
  152. -cafile "$::tlsdir/ca.crt" \
  153. -certfile "$::tlsdir/redis.crt" \
  154. -keyfile "$::tlsdir/redis.key"
  155. set ::tls 1
  156. } elseif {$opt eq "--help"} {
  157. puts "Hello, I'm sentinel.tcl and I run Sentinel unit tests."
  158. puts "\nOptions:"
  159. puts "--single <pattern> Only runs tests specified by pattern."
  160. puts "--pause-on-error Pause for manual inspection on error."
  161. puts "--fail Simulate a test failure."
  162. puts "--valgrind Run with valgrind."
  163. puts "--help Shows this help."
  164. exit 0
  165. } else {
  166. puts "Unknown option $opt"
  167. exit 1
  168. }
  169. }
  170. }
  171. # If --pause-on-error option was passed at startup this function is called
  172. # on error in order to give the developer a chance to understand more about
  173. # the error condition while the instances are still running.
  174. proc pause_on_error {} {
  175. puts ""
  176. puts [colorstr yellow "*** Please inspect the error now ***"]
  177. puts "\nType \"continue\" to resume the test, \"help\" for help screen.\n"
  178. while 1 {
  179. puts -nonewline "> "
  180. flush stdout
  181. set line [gets stdin]
  182. set argv [split $line " "]
  183. set cmd [lindex $argv 0]
  184. if {$cmd eq {continue}} {
  185. break
  186. } elseif {$cmd eq {show-redis-logs}} {
  187. set count 10
  188. if {[lindex $argv 1] ne {}} {set count [lindex $argv 1]}
  189. foreach_redis_id id {
  190. puts "=== REDIS $id ===="
  191. puts [exec tail -$count redis_$id/log.txt]
  192. puts "---------------------\n"
  193. }
  194. } elseif {$cmd eq {show-sentinel-logs}} {
  195. set count 10
  196. if {[lindex $argv 1] ne {}} {set count [lindex $argv 1]}
  197. foreach_sentinel_id id {
  198. puts "=== SENTINEL $id ===="
  199. puts [exec tail -$count sentinel_$id/log.txt]
  200. puts "---------------------\n"
  201. }
  202. } elseif {$cmd eq {ls}} {
  203. foreach_redis_id id {
  204. puts -nonewline "Redis $id"
  205. set errcode [catch {
  206. set str {}
  207. append str "@[RI $id tcp_port]: "
  208. append str "[RI $id role] "
  209. if {[RI $id role] eq {slave}} {
  210. append str "[RI $id master_host]:[RI $id master_port]"
  211. }
  212. set str
  213. } retval]
  214. if {$errcode} {
  215. puts " -- $retval"
  216. } else {
  217. puts $retval
  218. }
  219. }
  220. foreach_sentinel_id id {
  221. puts -nonewline "Sentinel $id"
  222. set errcode [catch {
  223. set str {}
  224. append str "@[SI $id tcp_port]: "
  225. append str "[join [S $id sentinel get-master-addr-by-name mymaster]]"
  226. set str
  227. } retval]
  228. if {$errcode} {
  229. puts " -- $retval"
  230. } else {
  231. puts $retval
  232. }
  233. }
  234. } elseif {$cmd eq {help}} {
  235. puts "ls List Sentinel and Redis instances."
  236. puts "show-sentinel-logs \[N\] Show latest N lines of logs."
  237. puts "show-redis-logs \[N\] Show latest N lines of logs."
  238. puts "S <id> cmd ... arg Call command in Sentinel <id>."
  239. puts "R <id> cmd ... arg Call command in Redis <id>."
  240. puts "SI <id> <field> Show Sentinel <id> INFO <field>."
  241. puts "RI <id> <field> Show Sentinel <id> INFO <field>."
  242. puts "continue Resume test."
  243. } else {
  244. set errcode [catch {eval $line} retval]
  245. if {$retval ne {}} {puts "$retval"}
  246. }
  247. }
  248. }
  249. # We redefine 'test' as for Sentinel we don't use the server-client
  250. # architecture for the test, everything is sequential.
  251. proc test {descr code} {
  252. set ts [clock format [clock seconds] -format %H:%M:%S]
  253. puts -nonewline "$ts> $descr: "
  254. flush stdout
  255. if {[catch {set retval [uplevel 1 $code]} error]} {
  256. incr ::failed
  257. if {[string match "assertion:*" $error]} {
  258. set msg [string range $error 10 end]
  259. puts [colorstr red $msg]
  260. if {$::pause_on_error} pause_on_error
  261. puts "(Jumping to next unit after error)"
  262. return -code continue
  263. } else {
  264. # Re-raise, let handler up the stack take care of this.
  265. error $error $::errorInfo
  266. }
  267. } else {
  268. puts [colorstr green OK]
  269. }
  270. }
  271. # Check memory leaks when running on OSX using the "leaks" utility.
  272. proc check_leaks instance_types {
  273. if {[string match {*Darwin*} [exec uname -a]]} {
  274. puts -nonewline "Testing for memory leaks..."; flush stdout
  275. foreach type $instance_types {
  276. foreach_instance_id [set ::${type}_instances] id {
  277. if {[instance_is_killed $type $id]} continue
  278. set pid [get_instance_attrib $type $id pid]
  279. set output {0 leaks}
  280. catch {exec leaks $pid} output
  281. if {[string match {*process does not exist*} $output] ||
  282. [string match {*cannot examine*} $output]} {
  283. # In a few tests we kill the server process.
  284. set output "0 leaks"
  285. } else {
  286. puts -nonewline "$type/$pid "
  287. flush stdout
  288. }
  289. if {![string match {*0 leaks*} $output]} {
  290. puts [colorstr red "=== MEMORY LEAK DETECTED ==="]
  291. puts "Instance type $type, ID $id:"
  292. puts $output
  293. puts "==="
  294. incr ::failed
  295. }
  296. }
  297. }
  298. puts ""
  299. }
  300. }
  301. # Execute all the units inside the 'tests' directory.
  302. proc run_tests {} {
  303. set tests [lsort [glob ../tests/*]]
  304. foreach test $tests {
  305. if {$::run_matching ne {} && [string match $::run_matching $test] == 0} {
  306. continue
  307. }
  308. if {[file isdirectory $test]} continue
  309. puts [colorstr yellow "Testing unit: [lindex [file split $test] end]"]
  310. source $test
  311. check_leaks {redis sentinel}
  312. }
  313. }
  314. # Print a message and exists with 0 / 1 according to zero or more failures.
  315. proc end_tests {} {
  316. if {$::failed == 0} {
  317. puts "GOOD! No errors."
  318. exit 0
  319. } else {
  320. puts "WARNING $::failed test(s) failed."
  321. exit 1
  322. }
  323. }
  324. # The "S" command is used to interact with the N-th Sentinel.
  325. # The general form is:
  326. #
  327. # S <sentinel-id> command arg arg arg ...
  328. #
  329. # Example to ping the Sentinel 0 (first instance): S 0 PING
  330. proc S {n args} {
  331. set s [lindex $::sentinel_instances $n]
  332. [dict get $s link] {*}$args
  333. }
  334. # Like R but to chat with Redis instances.
  335. proc R {n args} {
  336. set r [lindex $::redis_instances $n]
  337. [dict get $r link] {*}$args
  338. }
  339. proc get_info_field {info field} {
  340. set fl [string length $field]
  341. append field :
  342. foreach line [split $info "\n"] {
  343. set line [string trim $line "\r\n "]
  344. if {[string range $line 0 $fl] eq $field} {
  345. return [string range $line [expr {$fl+1}] end]
  346. }
  347. }
  348. return {}
  349. }
  350. proc SI {n field} {
  351. get_info_field [S $n info] $field
  352. }
  353. proc RI {n field} {
  354. get_info_field [R $n info] $field
  355. }
  356. # Iterate over IDs of sentinel or redis instances.
  357. proc foreach_instance_id {instances idvar code} {
  358. upvar 1 $idvar id
  359. for {set id 0} {$id < [llength $instances]} {incr id} {
  360. set errcode [catch {uplevel 1 $code} result]
  361. if {$errcode == 1} {
  362. error $result $::errorInfo $::errorCode
  363. } elseif {$errcode == 4} {
  364. continue
  365. } elseif {$errcode == 3} {
  366. break
  367. } elseif {$errcode != 0} {
  368. return -code $errcode $result
  369. }
  370. }
  371. }
  372. proc foreach_sentinel_id {idvar code} {
  373. set errcode [catch {uplevel 1 [list foreach_instance_id $::sentinel_instances $idvar $code]} result]
  374. return -code $errcode $result
  375. }
  376. proc foreach_redis_id {idvar code} {
  377. set errcode [catch {uplevel 1 [list foreach_instance_id $::redis_instances $idvar $code]} result]
  378. return -code $errcode $result
  379. }
  380. # Get the specific attribute of the specified instance type, id.
  381. proc get_instance_attrib {type id attrib} {
  382. dict get [lindex [set ::${type}_instances] $id] $attrib
  383. }
  384. # Set the specific attribute of the specified instance type, id.
  385. proc set_instance_attrib {type id attrib newval} {
  386. set d [lindex [set ::${type}_instances] $id]
  387. dict set d $attrib $newval
  388. lset ::${type}_instances $id $d
  389. }
  390. # Create a master-slave cluster of the given number of total instances.
  391. # The first instance "0" is the master, all others are configured as
  392. # slaves.
  393. proc create_redis_master_slave_cluster n {
  394. foreach_redis_id id {
  395. if {$id == 0} {
  396. # Our master.
  397. R $id slaveof no one
  398. R $id flushall
  399. } elseif {$id < $n} {
  400. R $id slaveof [get_instance_attrib redis 0 host] \
  401. [get_instance_attrib redis 0 port]
  402. } else {
  403. # Instances not part of the cluster.
  404. R $id slaveof no one
  405. }
  406. }
  407. # Wait for all the slaves to sync.
  408. wait_for_condition 1000 50 {
  409. [RI 0 connected_slaves] == ($n-1)
  410. } else {
  411. fail "Unable to create a master-slaves cluster."
  412. }
  413. }
  414. proc get_instance_id_by_port {type port} {
  415. foreach_${type}_id id {
  416. if {[get_instance_attrib $type $id port] == $port} {
  417. return $id
  418. }
  419. }
  420. fail "Instance $type port $port not found."
  421. }
  422. # Kill an instance of the specified type/id with SIGKILL.
  423. # This function will mark the instance PID as -1 to remember that this instance
  424. # is no longer running and will remove its PID from the list of pids that
  425. # we kill at cleanup.
  426. #
  427. # The instance can be restarted with restart-instance.
  428. proc kill_instance {type id} {
  429. set pid [get_instance_attrib $type $id pid]
  430. set port [get_instance_attrib $type $id port]
  431. if {$pid == -1} {
  432. error "You tried to kill $type $id twice."
  433. }
  434. exec kill -9 $pid
  435. set_instance_attrib $type $id pid -1
  436. set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance
  437. # Remove the PID from the list of pids to kill at exit.
  438. set ::pids [lsearch -all -inline -not -exact $::pids $pid]
  439. # Wait for the port it was using to be available again, so that's not
  440. # an issue to start a new server ASAP with the same port.
  441. set retry 10
  442. while {[incr retry -1]} {
  443. set port_is_free [catch {set s [socket 127.0.01 $port]}]
  444. if {$port_is_free} break
  445. catch {close $s}
  446. after 1000
  447. }
  448. if {$retry == 0} {
  449. error "Port $port does not return available after killing instance."
  450. }
  451. }
  452. # Return true of the instance of the specified type/id is killed.
  453. proc instance_is_killed {type id} {
  454. set pid [get_instance_attrib $type $id pid]
  455. expr {$pid == -1}
  456. }
  457. # Restart an instance previously killed by kill_instance
  458. proc restart_instance {type id} {
  459. set dirname "${type}_${id}"
  460. set cfgfile [file join $dirname $type.conf]
  461. set port [get_instance_attrib $type $id port]
  462. # Execute the instance with its old setup and append the new pid
  463. # file for cleanup.
  464. set pid [exec_instance $type $cfgfile]
  465. set_instance_attrib $type $id pid $pid
  466. lappend ::pids $pid
  467. # Check that the instance is running
  468. if {[server_is_up 127.0.0.1 $port 100] == 0} {
  469. abort_sentinel_test "Problems starting $type #$id: ping timeout"
  470. }
  471. # Connect with it with a fresh link
  472. set link [redis 127.0.0.1 $port 0 $::tls]
  473. $link reconnect 1
  474. set_instance_attrib $type $id link $link
  475. # Make sure the instance is not loading the dataset when this
  476. # function returns.
  477. while 1 {
  478. catch {[$link ping]} retval
  479. if {[string match {*LOADING*} $retval]} {
  480. after 100
  481. continue
  482. } else {
  483. break
  484. }
  485. }
  486. }