425
609
# generate_postgresql_hba: Creates the pg_hba.conf file
426
610
#------------------------------------------------------------------------------
427
611
def generate_postgresql_hba(postgresql_hba):
613
# Per Bug #1117542, when generating the postgresql_hba file we
614
# need to cope with private-address being either an IP address
616
def munge_address(addr):
617
# http://stackoverflow.com/q/319279/196832
619
socket.inet_aton(addr)
620
return "%s/32" % addr
622
# It's not an IP address.
428
625
relation_data = relation_get_all(relation_types=['db', 'db-admin'])
430
626
for relation in relation_data:
431
if re.search('db-admin', relation['relation-id']):
627
relation_id = relation['relation-id']
628
if relation_id.startswith('db-admin:'):
432
629
relation['user'] = 'all'
433
630
relation['database'] = 'all'
631
elif relation_id.startswith('db:'):
435
632
relation['user'] = user_name(relation['relation-id'],
436
633
relation['unit'])
437
634
relation['schema_user'] = user_name(relation['relation-id'],
438
635
relation['unit'],
440
# LP:1117542 - http://stackoverflow.com/q/319279/196832
442
socket.inet_aton(relation['private-address'])
443
relation['private-address'] = "%s/32" % relation['private-address']
445
# It's not an IP address.
639
'Unknown relation type {}'.format(repr(relation_id)))
641
relation['private-address'] = munge_address(
642
relation['private-address'])
448
644
juju_log(MSG_INFO, str(relation_data))
646
# Replication connections. Each unit needs to be able to connect to
647
# every other unit's repmgr database and the magic replication
648
# database. It also needs to be able to connect to its own repmgr
650
replication_relations = relation_get_all(
651
relation_types=replication_relation_types)
652
for relation in replication_relations:
653
remote_addr = munge_address(relation['private-address'])
654
remote_replication = {
655
'database': 'replication', 'user': 'repmgr',
656
'private-address': remote_addr,
657
'relation-id': relation['relation-id'],
658
'unit': relation['private-address'],
660
relation_data.append(remote_replication)
662
'database': 'repmgr', 'user': 'repmgr',
663
'private-address': remote_addr,
664
'relation-id': relation['relation-id'],
665
'unit': relation['private-address'],
667
relation_data.append(remote_repmgr)
668
if replication_relations:
670
'database': 'repmgr', 'user': 'repmgr',
671
'private-address': munge_address(get_unit_host()),
672
'relation-id': relation['relation-id'],
673
'unit': get_unit_host(),
675
relation_data.append(local_repmgr)
449
677
pg_hba_template = Template(
450
678
open("templates/pg_hba.conf.tmpl").read()).render(
451
access_list= relation_data)
679
access_list=relation_data)
452
680
with open(postgresql_hba, 'w') as hba_file:
453
681
hba_file.write(str(pg_hba_template))
454
# hba_conf changes do not need full db restarts
455
subprocess.call(['invoke-rc.d', 'postgresql', 'reload'])
458
686
#------------------------------------------------------------------------------
913
1178
generate_postgresql_hba(postgresql_hba)
1182
juju_log(MSG_WARNING, 'TODO> %s' % msg)
1185
def install_repmgr():
1186
'''Install the repmgr package if it isn't already.'''
1187
extra_repos = config_get('extra_archives')
1188
extra_repos_added = local_state.setdefault('extra_repos_added', set())
1191
for repo in extra_repos.split():
1192
if repo not in extra_repos_added:
1193
run("add-apt-repository --yes '{}'".format(repo))
1194
extra_repos_added.add(repo)
1197
run('apt-get update')
1199
apt_get_install('repmgr')
1200
apt_get_install('postgresql-9.1-repmgr')
1203
def ensure_local_ssh():
1204
"""Generate SSH keys for postgres user.
1206
The public key is stored in public_ssh_key on the relation.
1208
Bidirectional SSH access is required by repmgr.
1210
comment = 'repmgr key for {}'.format(os.environ['JUJU_UNIT_NAME'])
1211
if not os.path.isdir(postgres_ssh_dir):
1212
install_dir(postgres_ssh_dir, "postgres", "postgres", 0700)
1213
if not os.path.exists(postgres_ssh_private_key):
1214
run("sudo -u postgres -H ssh-keygen -q -t rsa -C '{}' -N '' "
1215
"-f '{}'".format(comment, postgres_ssh_private_key))
1216
public_key = open(postgres_ssh_public_key, 'r').read().strip()
1217
host_key = open('/etc/ssh/ssh_host_ecdsa_key.pub').read().strip()
1218
local_state['public_ssh_key'] = public_key
1219
local_state['ssh_host_key'] = host_key
1220
local_state.publish()
1223
def authorize_remote_ssh():
1224
"""Generate the SSH authorized_keys file."""
1225
authorized_units = set()
1226
authorized_keys = set()
1228
for relid in relation_ids(relation_types=replication_relation_types):
1229
for unit in relation_list(relid):
1230
relation = relation_get(unit_name=unit, relation_id=relid)
1231
public_key = relation.get('public_ssh_key', None)
1233
authorized_units.add(unit)
1234
authorized_keys.add(public_key)
1235
known_hosts.add('{} {}'.format(
1236
relation['private-address'], relation['ssh_host_key']))
1238
# Generate known_hosts
1240
'\n'.join(known_hosts), postgres_ssh_known_hosts,
1241
owner="postgres", group="postgres", mode=0o644)
1243
# Generate authorized_keys
1245
'\n'.join(authorized_keys), postgres_ssh_authorized_keys,
1246
owner="postgres", group="postgres", mode=0o400)
1248
# Publish details, so relation knows they have been granted access.
1249
local_state['authorized'] = authorized_units
1250
local_state.publish()
1253
def generate_pgpass(passwords):
1255
"*:*:*:{}:{}".format(username, password)
1256
for username, password in passwords.items())
1258
pgpass, postgres_pgpass,
1259
owner="postgres", group="postgres", mode=0o400)
1262
def generate_repmgr_config(node_id):
1263
"""Regenerate the repmgr config file.
1265
node_id is an integer, and must be a unique in the cluster.
1269
'node_name': os.environ['JUJU_UNIT_NAME'],
1270
'host': get_unit_host(),
1274
open("templates/repmgr.conf.tmpl").read()).render(params)
1276
config, repmgr_config, owner="postgres", group="postgres", mode=0o400)
1279
def run_repmgr(cmd, exit_on_error=True):
1280
full_command = "sudo -u postgres repmgr -f '{}' {}".format(
1282
juju_log(MSG_DEBUG, full_command)
1284
return subprocess.check_output(
1285
full_command, stderr=subprocess.STDOUT, shell=True)
1286
except subprocess.CalledProcessError, x:
1287
juju_log(MSG_ERROR, x.output)
1289
raise SystemExit(x.returncode)
1293
def drop_database(dbname, warn=True):
1299
db_cursor(autocommit=True).execute(
1300
'DROP DATABASE IF EXISTS "{}"'.format(dbname))
1301
except psycopg2.Error:
1302
if time.time() > now + timeout:
1304
juju_log(MSG_WARNING, "Unable to drop database %s" % dbname)
1312
def get_next_repmgr_node_id():
1313
# This hook does not run as ~postgres, so inform libpq where the
1315
os.environ['PGPASSFILE'] = postgres_pgpass
1317
host = get_unit_host()
1319
# A hot standby only calls this when setting up a relationship
1320
# with a master, so we assume the other end is the master if we
1322
host=relation_get('private-address')
1324
cur = db_cursor(autocommit=True, db='repmgr', user='repmgr', host=host)
1326
# We use a sequence for generating a unique id per node, as
1327
# required by repmgr. Create it if necessary.
1329
# TODO: Bug #806098 - there is no sane shared storage for
1330
# relation state, so we use a PostgreSQL sequence in our
1331
# replicated database. Using a sequence creates a race
1332
# condition where a new id is allocated on the master and we
1333
# failover before that information is replicated. This is
1334
# nearly impossible to hit. We could simply bump the sequence
1335
# by 100 after every failover.
1337
SELECT TRUE FROM information_schema.sequences
1338
WHERE sequence_catalog = 'repmgr' AND sequence_schema='public'
1339
AND sequence_name = 'juju_node_id'
1341
if cur.fetchone() is None:
1342
cur.execute('CREATE SEQUENCE juju_node_id')
1344
cur.execute("SELECT nextval('juju_node_id')")
1345
return cur.fetchone()[0]
1349
"""Remove old nodes from the repmgr database, tear down if no slaves"""
1351
for relid in relation_ids(replication_relation_types):
1352
wanted_units.extend(relation_list(relid))
1354
# If there are replication relationships, trash the local repmgr setup.
1355
if not wanted_units:
1356
# Restore a hot standby to a standalone configuration.
1357
if postgresql_is_in_recovery():
1358
pg_ctl = os.path.join(postgresql_bin_dir, 'pg_ctl')
1359
run("sudo -u postgres {} promote -D '{}'".format(
1360
pg_ctl, postgresql_cluster_dir))
1362
if os.path.exists(repmgr_config):
1363
juju_log(MSG_INFO, "No longer replicated. Dropping repmgr.")
1364
os.unlink(repmgr_config)
1366
if os.path.exists(postgres_pgpass):
1367
os.unlink(postgres_pgpass)
1369
drop_database('repmgr')
1371
local_state['state'] = 'standalone'
1374
elif is_master() and not postgresql_is_in_recovery():
1375
# There is at least one hot standby, and I'm the master.
1376
# Cleanup any dropped units from repmgr.
1377
wanted_units.append(os.environ['JUJU_UNIT_NAME'])
1379
MSG_DEBUG, "Remaining repmgr nodes are {}".format(
1380
', '.join(wanted_units)))
1381
cur = db_cursor(autocommit=True, db='repmgr')
1383
"DELETE FROM repmgr_juju.repl_nodes WHERE NOT ARRAY[name] <@ %s",
1388
'''True if we are, or should be, the master.
1390
Return True if I am the active master, or if neither myself nor
1391
the remote unit is and I win an election.
1393
master_relation_ids = relation_ids(relation_types=['master'])
1394
slave_relation_ids = relation_ids(relation_types=['slave'])
1395
if master_relation_ids and slave_relation_ids:
1396
# Both master and slave relations, so an attempt has been made
1397
# to set up cascading replication. This is not yet supported in
1398
# PostgreSQL, so we cannot support it either. Unfortunately,
1399
# there is no way yet to inform juju about this so we just have
1400
# to leave the impossible relation in a broken state.
1403
"Unable to create relationship. "
1404
"Cascading replication not supported.")
1407
if slave_relation_ids:
1408
# I'm explicitly the slave in a master/slave relationship.
1409
# No units in my service can be a master.
1412
# Do I think I'm the master?
1413
if local_state['state'] == 'master':
1416
# Lets see what out peer group thinks.
1418
for relid in relation_ids(relation_types=['replication']):
1419
# If there are any other peers claiming to be the master, then I am
1421
for unit in relation_list(relid):
1422
peer_units.add(unit)
1423
if relation_get('state', unit, relid) == 'master':
1426
# Are there other units? Maybe we are the only one left in the
1427
# various master/slave/replication relationships.
1429
for relid in relation_ids(relation_types=replication_relation_types):
1430
if relation_list(relid):
1434
juju_log(MSG_INFO, "I am alone, no point being a master")
1437
# There are no masters, so we need an election within this peer
1438
# relation. Lowest unit number wins and gets to be the master.
1439
remote_nums = sorted(int(unit.split('/', 1)[1]) for unit in peer_units)
1441
return True # Only unit in a service in a master relationship.
1442
my_num = int(os.environ['JUJU_UNIT_NAME'].split('/', 1)[1])
1443
if my_num < remote_nums[0]:
1449
def replication_relation_changed():
1450
ensure_local_ssh() # Generate SSH key and publish details
1451
authorize_remote_ssh() # Authorize relationship SSH keys.
1452
config_changed(postgresql_config) # Ensure minimal replication settings.
1456
relation = relation_get()
1459
if local_state['state'] == 'standalone': # Initial setup of a master.
1460
juju_log(MSG_INFO, "I am standalone and becoming the master")
1461
# The user repmgr connects as for both replication and
1463
repmgr_password = create_user(
1464
'repmgr', admin=True, replication=True)
1465
generate_pgpass(dict(repmgr=repmgr_password))
1466
drop_database('repmgr')
1467
ensure_database('repmgr', 'repmgr', 'repmgr')
1468
master_node_id = get_next_repmgr_node_id()
1469
generate_repmgr_config(master_node_id)
1470
run_repmgr('master register')
1471
local_state['state'] = 'master'
1472
local_state['repmgr_password'] = repmgr_password
1473
juju_log(MSG_INFO, "Publishing repmgr details to hot standbys")
1474
local_state.publish()
1476
elif local_state['state'] == 'master': # Already the master.
1477
juju_log(MSG_INFO, "I am the master")
1480
elif local_state['state'] == 'hot standby': # I've been promoted
1481
juju_log(MSG_INFO, "I am a hot standby being promoted to master")
1482
# Urgh. I can't just promote the hot standby to a master,
1483
# as it fails because the master db is still running alive
1484
# and well despite no longer being in the relation, due to
1485
# Bug #872264. repmgr thinks I'm trying to blow my foot off.
1486
# And I can't shoot it in the head if it is still alive,
1487
# because the master might be in a different service and we
1488
# want to keep it and its data alive (eg. replicating a
1489
# production database into a new service, then breaking the
1490
# relation and using it as a staging environment).
1491
# For now, we just attempt the promotion and fail if the
1492
# master is still alive; shutting down the spurious
1493
# PostgreSQL server and 'juju resolved --retry' will get
1494
# things back on track.
1496
run_repmgr('--verbose standby promote', exit_on_error=False)
1497
except subprocess.CalledProcessError, x:
1500
"Failed to promote. Is the old master still alive? "
1501
"Shut it down and 'juju resolved --retry' this "
1502
"relation to resolve.")
1503
raise SystemExit(x.returncode)
1504
local_state['state'] = 'master'
1505
local_state.publish()
1509
raise AssertionError(
1510
"Unknown state {}".format(local_state['state']))
1512
else: # A hot standby, now or soon.
1513
juju_log(MSG_INFO, "I am a hot standby")
1514
remote_is_master = (relation.get('state', '') == 'master')
1516
remote_has_authorized = False
1517
for unit in relation.get('authorized', '').split():
1518
if unit == os.environ['JUJU_UNIT_NAME']:
1519
remote_has_authorized = True
1521
if remote_is_master and remote_has_authorized:
1522
if local_state['state'] in ['standalone', 'master']:
1523
# Republish the repmgr password in case we failover to
1524
# being the master in the future. Bug #806098.
1525
local_state['repmgr_password'] = relation['repmgr_password']
1526
local_state.publish()
1528
# We are just joining replication, and have found a
1529
# master. Clone and follow it.
1530
generate_pgpass(dict(repmgr=relation['repmgr_password']))
1531
generate_repmgr_config(get_next_repmgr_node_id())
1533
# Before we start destroying anything, ensure that the
1534
# master is contactable.
1536
db='repmgr', user='repmgr',
1537
host=relation['private-address'])
1542
"Cloning master {}".format(os.environ['JUJU_REMOTE_UNIT']))
1543
# repmgr clone fails, even with --force specified, with
1544
# rsync errors if symlinks have been changed.
1545
if os.path.isdir(postgresql_cluster_dir):
1546
shutil.rmtree(postgresql_cluster_dir)
1549
'-D {} -d repmgr -p 5432 -U repmgr -R postgres '
1550
'--force standby clone {}'.format(
1551
postgresql_cluster_dir,
1552
relation['private-address']),
1553
exit_on_error=False)
1554
except subprocess.CalledProcessError:
1555
# We failed, and this cluster is broken. Rebuild a
1556
# working cluster so start/stop etc. works and we
1557
# can retry hooks again. Even assuming the charm is
1558
# functioning correctly, the clone may still fail
1559
# due to eg. lack of disk space.
1560
juju_log(MSG_ERROR, "Clone failed, db cluster destroyed")
1561
if os.path.exists(postgresql_cluster_dir):
1562
shutil.rmtree(postgresql_cluster_dir)
1563
if os.path.exists(postgresql_config_dir):
1564
shutil.rmtree(postgresql_config_dir)
1565
run('pg_createcluster 9.1 main')
1566
config_changed(postgresql_config)
1571
run_repmgr('standby register')
1572
juju_log(MSG_INFO, "Registered cluster with repmgr")
1573
local_state['state'] = 'hot standby'
1574
local_state['following'] = os.environ['JUJU_REMOTE_UNIT']
1575
local_state.publish()
1577
elif local_state['state'] == 'hot standby':
1578
if local_state['following'] != os.environ['JUJU_REMOTE_UNIT']:
1580
MSG_INFO, "New master {} found. Following".format(
1581
os.environ['JUJU_REMOTE_UNIT']))
1582
run_repmgr('standby follow', exit_on_error=True)
1583
local_state['following'] = os.environ['JUJU_REMOTE_UNIT']
1587
raise AssertionError(
1588
"Unknown state {}".format(local_state['state']))
1591
def replication_relation_broken():
1592
config_changed(postgresql_config)
1593
authorize_remote_ssh()
1599
for relid in relation_ids(relation_types=replication_relation_types):
1600
num_slaves += len(relation_list(relid))
1604
def postgresql_is_in_recovery():
    """Return True if this PostgreSQL server is still in recovery.

    ``pg_is_in_recovery()`` reports true on a hot standby that is
    replaying WAL, and false on a master or standalone server.
    """
    # Autocommit: this is a read-only status query, no transaction needed.
    cur = db_cursor(autocommit=True)
    cur.execute("SELECT pg_is_in_recovery()")
    return cur.fetchone()[0]
1610
def postgresql_is_in_backup_mode():
    """Return True if a base backup is in progress on this cluster.

    ``pg_start_backup()`` drops a ``backup_label`` file in the cluster
    data directory and ``pg_stop_backup()`` removes it, so the file's
    presence indicates an in-progress (or aborted) base backup.
    """
    return os.path.exists(
        os.path.join(postgresql_cluster_dir, 'backup_label'))
1615
def wait_for_db(timeout=120, db='template1', user='postgres', host=None):
    '''Block until the database accepts connections.

    Relies on db_cursor() retrying the connection for up to `timeout`
    seconds; raises if the database is still unreachable after that.
    The cursor is discarded -- only the successful connection matters.
    '''
    db_cursor(db=db, user=user, host=host, timeout=timeout)
916
1620
def update_nrpe_checks():
917
1621
config_data = config_get()
919
1623
nagios_uid = getpwnam('nagios').pw_uid
920
1624
nagios_gid = getgrnam('nagios').gr_gid
922
subprocess.call(['juju-log', "Nagios user not set up. Exiting."])
1626
juju_log(MSG_DEBUG, "Nagios user not set up. Exiting.")
925
1629
unit_name = os.environ['JUJU_UNIT_NAME'].replace('/', '-')