        self.assertEqual(result, [['t']])

    def is_master(self, postgres_unit, dbname=None):
        is_master = self.sql(
            'SELECT NOT pg_is_in_recovery()',
            postgres_unit, dbname=dbname)[0][0]
        return (is_master == 't')
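
    # pg_is_in_recovery() returns true while a unit is replaying WAL as
    # a hot standby, so NOT pg_is_in_recovery() identifies the master.
    # psql prints booleans as 't'/'f', which is why is_master() compares
    # the result against the string 't' rather than True.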

    def test_failover(self):
        """Set up a multi-unit service and perform failovers."""
        self.juju.deploy(TEST_CHARM, 'postgresql', num_units=4)
        self.juju.deploy(PSQL_CHARM, 'psql')
        self.juju.do(['add-relation', 'postgresql:db-admin', 'psql:db-admin'])
        self.juju.wait_until_ready()

        # On a freshly setup service, the lowest numbered unit is
        # always the master.
        units = unit_sorted(
            self.juju.status['services']['postgresql']['units'].keys())
        master_unit, standby_unit_1, standby_unit_2, standby_unit_3 = units
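
        # unit_sorted() (defined at module level below) presumably
        # sorts unit names numerically by unit number, so the first
        # entry is the lowest numbered unit and therefore the master.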

        # Confirm the units agree on their roles.
        self.assertIs(True, self.is_master(master_unit, 'postgres'))
        self.assertIs(False, self.is_master(standby_unit_1, 'postgres'))
        self.assertIs(False, self.is_master(standby_unit_2, 'postgres'))
        self.assertIs(False, self.is_master(standby_unit_3, 'postgres'))
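
        # assertIs(True, ...) is deliberately stricter than
        # assertTrue(...): it also verifies is_master() returns an
        # actual boolean rather than any truthy value.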

        self.sql('CREATE TABLE Token (x int)', master_unit, dbname='postgres')

        # Some simple helpers to send data via the master and check
        # that it was replicated to the hot standbys.
        _counter = [0]

        def send_token(unit):
            _counter[0] += 1
            self.sql(
                "INSERT INTO Token VALUES (%d)" % _counter[0],
                unit, dbname='postgres')

        def token_received(unit):
            r = self.sql(
                "SELECT TRUE FROM Token WHERE x=%d" % _counter[0],
                unit, dbname='postgres')
            return (r == [['t']])
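
        # The counter lives in a one-element list so the nested helpers
        # can mutate it; this suite appears to target Python 2, which
        # lacks the 'nonlocal' statement that would otherwise be used.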

        # Confirm that replication is actually happening.
        send_token(master_unit)
        self.assertIs(True, token_received(standby_unit_1))
        self.assertIs(True, token_received(standby_unit_2))
        self.assertIs(True, token_received(standby_unit_3))

        # When we failover, the unit most in sync with the old master
        # is elected the new master. Pause WAL replay on standby_unit_1
        # and standby_unit_3 to ensure that standby_unit_2 is the best
        # candidate for master.
        self.sql(
            'SELECT pg_xlog_replay_pause()', standby_unit_1,
            dbname='postgres')
        self.sql(
            'SELECT pg_xlog_replay_pause()', standby_unit_3,
            dbname='postgres')
        send_token(master_unit)
        self.assertIs(False, token_received(standby_unit_1))
        self.assertIs(True, token_received(standby_unit_2))
        self.assertIs(False, token_received(standby_unit_3))
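
        # pg_xlog_replay_pause() halts WAL *replay* on a standby while
        # streaming may continue in the background. A paused standby
        # cannot see the new token and lags in replayed WAL, which is
        # presumably what the charm's election compares.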

        # Remove the master unit.
        self.juju.do(['remove-unit', master_unit])
        self.juju.wait_until_ready()

        # Confirm the failover worked as expected.
        self.assertIs(False, self.is_master(standby_unit_1, 'postgres'))
        self.assertIs(True, self.is_master(standby_unit_2, 'postgres'))
        self.assertIs(False, self.is_master(standby_unit_3, 'postgres'))

        master_unit = standby_unit_2

        # Replication was not reenabled by the failover.
        send_token(master_unit)
        self.assertIs(False, token_received(standby_unit_1))
        self.assertIs(False, token_received(standby_unit_3))

        self.sql(
            'SELECT pg_xlog_replay_resume()',
            standby_unit_1, dbname='postgres')
        self.sql(
            'SELECT pg_xlog_replay_resume()',
            standby_unit_3, dbname='postgres')

        # Now replication is happening again.
        self.assertIs(True, token_received(standby_unit_1))
        self.assertIs(True, token_received(standby_unit_3))

        # Remove the master again.
        self.juju.do(['remove-unit', master_unit])
        self.juju.wait_until_ready()

        # Now we have a new master, and we can't be sure which of the
        # remaining two units was elected because we don't know if one
        # happened to be more in sync with the old master than the
        # other.
        standby_unit_1_is_master = self.is_master(standby_unit_1, 'postgres')
        standby_unit_3_is_master = self.is_master(standby_unit_3, 'postgres')
        self.assertNotEqual(
            standby_unit_1_is_master, standby_unit_3_is_master)
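
        # Both values are booleans, so assertNotEqual amounts to an XOR
        # check: exactly one of the two survivors claims the master
        # role.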

        if standby_unit_1_is_master:
            master_unit = standby_unit_1
            standby_unit = standby_unit_3
        else:
            master_unit = standby_unit_3
            standby_unit = standby_unit_1

        # Replication is already flowing; confirm it is still working.
        send_token(master_unit)
        self.assertIs(True, token_received(standby_unit))

        # When we remove the last master, we end up with a single
        # functioning standalone database.
        self.juju.do(['remove-unit', master_unit])
        self.juju.wait_until_ready()

        # The last unit is a working, standalone database.
        self.assertIs(True, self.is_master(standby_unit, 'postgres'))
        send_token(standby_unit)

        # TODO: We need to extend the postgresql-psql charm to allow us
        # to inspect the status attribute on the relation. It should no
        # longer be 'master', but instead 'standalone'. Until then, we
        # can tell the unit is correctly reporting that it is standalone
        # by checking that the -master and -hot-standby scripts no
        # longer exist on the psql unit.
        self.assertRaises(
            subprocess.CalledProcessError,
            self.sql, 'SELECT TRUE', 'master')
        self.assertRaises(
            subprocess.CalledProcessError,
            self.sql, 'SELECT TRUE', 'hot standby')
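
        # self.sql() presumably routes through per-role wrapper scripts
        # published by the psql charm; with the unit standalone, neither
        # the 'master' nor the 'hot standby' target exists, so psql
        # exits non-zero and raises subprocess.CalledProcessError.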

    def test_failover_election(self):
        """Ensure the master elected in a failover is the best choice."""
        self.juju.deploy(TEST_CHARM, 'postgresql', num_units=3)
        self.juju.deploy(PSQL_CHARM, 'psql')
        self.juju.do(['add-relation', 'postgresql:db-admin', 'psql:db-admin'])
        self.juju.wait_until_ready()

        # On a freshly setup service, the lowest numbered unit is
        # always the master.
        units = unit_sorted(
            self.juju.status['services']['postgresql']['units'].keys())
        master_unit, standby_unit_1, standby_unit_2 = units

        # Shut down PostgreSQL on standby_unit_1 and ensure
        # standby_unit_2 will have received more WAL information from
        # the master.
        self.pg_ctlcluster(standby_unit_1, 'stop')
        self.sql("SELECT pg_switch_xlog()", master_unit, dbname='postgres')

        # Destroy the master database, as if this was a real failure.
        cmd = ['juju', 'ssh', master_unit,
               # Due to Bug #1191079, we need to send the whole remote
               # command as a single argument.
               'sudo pg_dropcluster --stop 9.1 main']
        subprocess.check_call(cmd)  # Assumed runner; plain subprocess.
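
        # pg_dropcluster (Debian/Ubuntu postgresql-common) stops the
        # cluster and deletes its data directory outright, a reasonable
        # stand-in for a catastrophic, unrecoverable master failure.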

        # Restart standby_unit_1 now that the master unit is dead and
        # it has no way of resyncing.
        self.pg_ctlcluster(standby_unit_1, 'start')

        # Failover. Note that this also tests we can remove a unit that
        # does not have a working database.
        self.juju.do(['remove-unit', master_unit])
        self.juju.wait_until_ready()

        # Ensure the election went as predicted.
        self.assertIs(False, self.is_master(standby_unit_1))
        self.assertIs(True, self.is_master(standby_unit_2))


def unit_sorted(units):