425
609
# generate_postgresql_hba: Creates the pg_hba.conf file
426
610
#------------------------------------------------------------------------------
427
611
def generate_postgresql_hba(postgresql_hba):
613
# Per Bug #1117542, when generating the postgresql_hba file we
614
# need to cope with private-address being either an IP address
616
def munge_address(addr):
617
# http://stackoverflow.com/q/319279/196832
619
socket.inet_aton(addr)
620
return "%s/32" % addr
622
# It's not an IP address.
428
625
relation_data = relation_get_all(relation_types=['db', 'db-admin'])
430
626
for relation in relation_data:
431
if re.search('db-admin', relation['relation-id']):
627
relation_id = relation['relation-id']
628
if relation_id.startswith('db-admin:'):
432
629
relation['user'] = 'all'
433
630
relation['database'] = 'all'
631
elif relation_id.startswith('db:'):
435
632
relation['user'] = user_name(relation['relation-id'],
436
633
relation['unit'])
437
634
relation['schema_user'] = user_name(relation['relation-id'],
438
635
relation['unit'],
440
# LP:1117542 - http://stackoverflow.com/q/319279/196832
442
socket.inet_aton(relation['private-address'])
443
relation['private-address'] = "%s/32" % relation['private-address']
445
# It's not an IP address.
639
'Unknown relation type {}'.format(repr(relation_id)))
641
relation['private-address'] = munge_address(
642
relation['private-address'])
448
644
juju_log(MSG_INFO, str(relation_data))
646
# Replication connections. Each unit needs to be able to connect to
647
# every other unit's repmgr database and the magic replication
648
# database. It also needs to be able to connect to its own repmgr
650
replication_relations = relation_get_all(
651
relation_types=replication_relation_types)
652
for relation in replication_relations:
653
remote_addr = munge_address(relation['private-address'])
654
remote_replication = {
655
'database': 'replication', 'user': 'repmgr',
656
'private-address': remote_addr,
657
'relation-id': relation['relation-id'],
658
'unit': relation['private-address'],
660
relation_data.append(remote_replication)
662
'database': 'repmgr', 'user': 'repmgr',
663
'private-address': remote_addr,
664
'relation-id': relation['relation-id'],
665
'unit': relation['private-address'],
667
relation_data.append(remote_repmgr)
668
if replication_relations:
670
'database': 'repmgr', 'user': 'repmgr',
671
'private-address': munge_address(get_unit_host()),
672
'relation-id': relation['relation-id'],
673
'unit': get_unit_host(),
675
relation_data.append(local_repmgr)
449
677
pg_hba_template = Template(
450
678
open("templates/pg_hba.conf.tmpl").read()).render(
451
access_list= relation_data)
679
access_list=relation_data)
452
680
with open(postgresql_hba, 'w') as hba_file:
453
681
hba_file.write(str(pg_hba_template))
454
# hba_conf changes do not need full db restarts
455
subprocess.call(['invoke-rc.d', 'postgresql', 'reload'])
458
686
#------------------------------------------------------------------------------
913
1178
generate_postgresql_hba(postgresql_hba)
1182
juju_log(MSG_WARNING, 'TODO> %s' % msg)
1185
def install_repmgr():
1186
'''Install the repmgr package if it isn't already.'''
1187
extra_repos = config_get('extra_archives')
1188
extra_repos_added = local_state.setdefault('extra_repos_added', set())
1191
for repo in extra_repos.split():
1192
if repo not in extra_repos_added:
1193
run("add-apt-repository --yes '{}'".format(repo))
1194
extra_repos_added.add(repo)
1197
run('apt-get update')
1199
apt_get_install('repmgr')
1200
apt_get_install('postgresql-9.1-repmgr')
1203
def ensure_local_ssh():
1204
"""Generate SSH keys for postgres user.
1206
The public key is stored in public_ssh_key on the relation.
1208
Bidirectional SSH access is required by repmgr.
1210
comment = 'repmgr key for {}'.format(os.environ['JUJU_UNIT_NAME'])
1211
if not os.path.isdir(postgres_ssh_dir):
1212
install_dir(postgres_ssh_dir, "postgres", "postgres", 0700)
1213
if not os.path.exists(postgres_ssh_private_key):
1214
run("sudo -u postgres -H ssh-keygen -q -t rsa -C '{}' -N '' "
1215
"-f '{}'".format(comment, postgres_ssh_private_key))
1216
public_key = open(postgres_ssh_public_key, 'r').read().strip()
1217
host_key = open('/etc/ssh/ssh_host_ecdsa_key.pub').read().strip()
1218
local_state['public_ssh_key'] = public_key
1219
local_state['ssh_host_key'] = host_key
1220
local_state.publish()
1223
def authorize_remote_ssh():
1224
"""Generate the SSH authorized_keys file."""
1225
authorized_units = set()
1226
authorized_keys = set()
1228
for relid in relation_ids(relation_types=replication_relation_types):
1229
for unit in relation_list(relid):
1230
relation = relation_get(unit_name=unit, relation_id=relid)
1231
public_key = relation.get('public_ssh_key', None)
1233
authorized_units.add(unit)
1234
authorized_keys.add(public_key)
1235
known_hosts.add('{} {}'.format(
1236
relation['private-address'], relation['ssh_host_key']))
1238
# Generate known_hosts
1240
'\n'.join(known_hosts), postgres_ssh_known_hosts,
1241
owner="postgres", group="postgres", mode=0o644)
1243
# Generate authorized_keys
1245
'\n'.join(authorized_keys), postgres_ssh_authorized_keys,
1246
owner="postgres", group="postgres", mode=0o400)
1248
# Publish details, so relation knows they have been granted access.
1249
local_state['authorized'] = authorized_units
1250
local_state.publish()
1253
def generate_pgpass(passwords):
1255
"*:*:*:{}:{}".format(username, password)
1256
for username, password in passwords.items())
1258
pgpass, postgres_pgpass,
1259
owner="postgres", group="postgres", mode=0o400)
1262
def generate_repmgr_config(node_id):
1263
"""Regenerate the repmgr config file.
1265
node_id is an integer, and must be a unique in the cluster.
1269
'node_name': os.environ['JUJU_UNIT_NAME'],
1270
'host': get_unit_host(),
1274
open("templates/repmgr.conf.tmpl").read()).render(params)
1276
config, repmgr_config, owner="postgres", group="postgres", mode=0o400)
1279
def run_repmgr(cmd, exit_on_error=True):
1280
full_command = "sudo -u postgres repmgr -f '{}' {}".format(
1282
juju_log(MSG_DEBUG, full_command)
1284
return subprocess.check_output(
1285
full_command, stderr=subprocess.STDOUT, shell=True)
1286
except subprocess.CalledProcessError, x:
1287
juju_log(MSG_ERROR, x.output)
1289
raise SystemExit(x.returncode)
1293
def drop_database(dbname, warn=True):
1299
db_cursor(autocommit=True).execute(
1300
'DROP DATABASE IF EXISTS "{}"'.format(dbname))
1301
except psycopg2.Error:
1302
if time.time() > now + timeout:
1304
juju_log(MSG_WARNING, "Unable to drop database %s" % dbname)
1312
def get_next_repmgr_node_id():
1313
# This hook does not run as ~postgres, so inform libpq where the
1315
os.environ['PGPASSFILE'] = postgres_pgpass
1317
host = get_unit_host()
1319
# A hot standby only calls this when setting up a relationship
1320
# with a master, so we assume the other end is the master if we
1322
host=relation_get('private-address')
1324
cur = db_cursor(autocommit=True, db='repmgr', user='repmgr', host=host)
1326
# We use a sequence for generating a unique id per node, as
1327
# required by repmgr. Create it if necessary.
1329
# TODO: Bug #806098 - there is no sane shared storage for
1330
# relation state, so we use a PostgreSQL sequence in our
1331
# replicated database. Using a sequence creates a race
1332
# condition where a new id is allocated on the master and we
1333
# failover before that information is replicated. This is
1334
# nearly impossible to hit. We could simply bump the sequence
1335
# by 100 after every failover.
1337
SELECT TRUE FROM information_schema.sequences
1338
WHERE sequence_catalog = 'repmgr' AND sequence_schema='public'
1339
AND sequence_name = 'juju_node_id'
1341
if cur.fetchone() is None:
1342
cur.execute('CREATE SEQUENCE juju_node_id')
1344
cur.execute("SELECT nextval('juju_node_id')")
1345
return cur.fetchone()[0]
1349
"""Remove old nodes from the repmgr database, tear down if no slaves"""
1351
for relid in relation_ids(replication_relation_types):
1352
wanted_units.extend(relation_list(relid))
1354
# If there are replication relationships, trash the local repmgr setup.
1355
if not wanted_units:
1356
# Restore a hot standby to a standalone configuration.
1357
if postgresql_is_in_recovery():
1358
pg_ctl = os.path.join(postgresql_bin_dir, 'pg_ctl')
1359
run("sudo -u postgres {} promote -D '{}'".format(
1360
pg_ctl, postgresql_cluster_dir))
1362
if os.path.exists(repmgr_config):
1363
juju_log(MSG_INFO, "No longer replicated. Dropping repmgr.")
1364
os.unlink(repmgr_config)
1366
if os.path.exists(postgres_pgpass):
1367
os.unlink(postgres_pgpass)
1369
drop_database('repmgr')
1371
local_state['state'] = 'standalone'
1374
elif is_master() and not postgresql_is_in_recovery():
1375
# There is at least one hot standby, and I'm the master.
1376
# Cleanup any dropped units from repmgr.
1377
wanted_units.append(os.environ['JUJU_UNIT_NAME'])
1379
MSG_DEBUG, "Remaining repmgr nodes are {}".format(
1380
', '.join(wanted_units)))
1381
cur = db_cursor(autocommit=True, db='repmgr')
1383
"DELETE FROM repmgr_juju.repl_nodes WHERE NOT ARRAY[name] <@ %s",
1388
'''True if we are, or should be, the master.
1390
Return True if I am the active master, or if neither myself nor
1391
the remote unit is and I win an election.
1393
master_relation_ids = relation_ids(relation_types=['master'])
1394
slave_relation_ids = relation_ids(relation_types=['slave'])
1395
if master_relation_ids and slave_relation_ids:
1396
# Both master and slave relations, so an attempt has been made
1397
# to set up cascading replication. This is not yet supported in
1398
# PostgreSQL, so we cannot support it either. Unfortunately,
1399
# there is no way yet to inform juju about this so we just have
1400
# to leave the impossible relation in a broken state.
1403
"Unable to create relationship. "
1404
"Cascading replication not supported.")
1407
if slave_relation_ids:
1408
# I'm explicitly the slave in a master/slave relationship.
1409
# No units in my service can be a master.
1412
# Do I think I'm the master?
1413
if local_state['state'] == 'master':
1416
# Lets see what out peer group thinks.
1418
for relid in relation_ids(relation_types=['replication']):
1419
# If there are any other peers claiming to be the master, then I am
1421
for unit in relation_list(relid):
1422
peer_units.add(unit)
1423
if relation_get('state', unit, relid) == 'master':
1426
# Are there other units? Maybe we are the only one left in the
1427
# various master/slave/replication relationships.
1429
for relid in relation_ids(relation_types=replication_relation_types):
1430
if relation_list(relid):
1434
juju_log(MSG_INFO, "I am alone, no point being a master")
1437
# There are no masters, so we need an election within this peer
1438
# relation. Lowest unit number wins and gets to be the master.
1439
remote_nums = sorted(int(unit.split('/', 1)[1]) for unit in peer_units)
1441
return True # Only unit in a service in a master relationship.
1442
my_num = int(os.environ['JUJU_UNIT_NAME'].split('/', 1)[1])
1443
if my_num < remote_nums[0]:
1449
def replication_relation_changed():
1450
ensure_local_ssh() # Generate SSH key and publish details
1451
authorize_remote_ssh() # Authorize relationship SSH keys.
1452
config_changed(postgresql_config) # Ensure minimal replication settings.
1456
relation = relation_get()
1459
if local_state['state'] == 'standalone': # Initial setup of a master.
1460
juju_log(MSG_INFO, "I am standalone and becoming the master")
1461
# The user repmgr connects as for both replication and
1463
repmgr_password = create_user(
1464
'repmgr', admin=True, replication=True)
1465
generate_pgpass(dict(repmgr=repmgr_password))
1466
drop_database('repmgr')
1467
ensure_database('repmgr', 'repmgr', 'repmgr')
1468
master_node_id = get_next_repmgr_node_id()
1469
generate_repmgr_config(master_node_id)
1470
run_repmgr('master register')
1471
local_state['state'] = 'master'
1472
local_state['repmgr_password'] = repmgr_password
1473
juju_log(MSG_INFO, "Publishing repmgr details to hot standbys")
1474
local_state.publish()
1476
elif local_state['state'] == 'master': # Already the master.
1477
juju_log(MSG_INFO, "I am the master")
1480
elif local_state['state'] == 'hot standby': # I've been promoted
1481
juju_log(MSG_INFO, "I am a hot standby being promoted to master")
1482
# Urgh. I can't just promote the hot standby to a master,
1483
# as it fails because the master db is still running alive
1484
# and well despite no longer being in the relation, due to
1485
# Bug #872264. repmgr thinks I'm trying to blow my foot off.
1486
# And I can't shoot it in the head if it is still alive,
1487
# because the master might be in a different service and we
1488
# want to keep it and its data alive (eg. replicating a
1489
# production database into a new service, then breaking the
1490
# relation and using it as a staging environment).
1491
# For now, we just attempt the promotion and fail if the
1492
# master is still alive; shutting down the spurious
1493
# PostgreSQL server and 'juju resolved --retry' will get
1494
# things back on track.
1496
run_repmgr('--verbose standby promote', exit_on_error=False)
1497
except subprocess.CalledProcessError, x:
1500
"Failed to promote. Is the old master still alive? "
1501
"Shut it down and 'juju resolved --retry' this "
1502
"relation to resolve.")
1503
raise SystemExit(x.returncode)
1504
local_state['state'] = 'master'
1505
local_state.publish()
1509
raise AssertionError(
1510
"Unknown state {}".format(local_state['state']))
1512
else: # A hot standby, now or soon.
1513
juju_log(MSG_INFO, "I am a hot standby")
1514
remote_is_master = (relation.get('state', '') == 'master')
1516
remote_has_authorized = False
1517
for unit in relation.get('authorized', '').split():
1518
if unit == os.environ['JUJU_UNIT_NAME']:
1519
remote_has_authorized = True
1521
if remote_is_master and remote_has_authorized:
1522
if local_state['state'] in ['standalone', 'master']:
1523
# Republish the repmgr password in case we failover to
1524
# being the master in the future. Bug #806098.
1525
local_state['repmgr_password'] = relation['repmgr_password']
1526
local_state.publish()
1528
# We are just joining replication, and have found a
1529
# master. Clone and follow it.
1530
generate_pgpass(dict(repmgr=relation['repmgr_password']))
1531
generate_repmgr_config(get_next_repmgr_node_id())
1533
# Before we start destroying anything, ensure that the
1534
# master is contactable.
1536
db='repmgr', user='repmgr',
1537
host=relation['private-address'])
1542
"Cloning master {}".format(os.environ['JUJU_REMOTE_UNIT']))
1543
# repmgr clone fails, even with --force specified, with
1544
# rsync errors if symlinks have been changed.
1545
if os.path.isdir(postgresql_cluster_dir):
1546
shutil.rmtree(postgresql_cluster_dir)
1549
'-D {} -d repmgr -p 5432 -U repmgr -R postgres '
1550
'--force standby clone {}'.format(
1551
postgresql_cluster_dir,
1552
relation['private-address']),
1553
exit_on_error=False)
1554
except subprocess.CalledProcessError:
1555
# We failed, and this cluster is broken. Rebuild a
1556
# working cluster so start/stop etc. works and we
1557
# can retry hooks again. Even assuming the charm is
1558
# functioning correctly, the clone may still fail
1559
# due to eg. lack of disk space.
1560
juju_log(MSG_ERROR, "Clone failed, db cluster destroyed")
1561
if os.path.exists(postgresql_cluster_dir):
1562
shutil.rmtree(postgresql_cluster_dir)
1563
if os.path.exists(postgresql_config_dir):
1564
shutil.rmtree(postgresql_config_dir)
1565
run('pg_createcluster 9.1 main')
1566
config_changed(postgresql_config)
1571
run_repmgr('standby register')
1572
juju_log(MSG_INFO, "Registered cluster with repmgr")
1573
local_state['state'] = 'hot standby'
1574
local_state['following'] = os.environ['JUJU_REMOTE_UNIT']
1575
local_state.publish()
1577
elif local_state['state'] == 'hot standby':
1578
if local_state['following'] != os.environ['JUJU_REMOTE_UNIT']:
1580
MSG_INFO, "New master {} found. Following".format(
1581
os.environ['JUJU_REMOTE_UNIT']))
1582
run_repmgr('standby follow', exit_on_error=True)
1583
local_state['following'] = os.environ['JUJU_REMOTE_UNIT']
1587
raise AssertionError(
1588
"Unknown state {}".format(local_state['state']))
1591
def replication_relation_broken():
1592
config_changed(postgresql_config)
1593
authorize_remote_ssh()
1599
for relid in relation_ids(relation_types=replication_relation_types):
1600
num_slaves += len(relation_list(relid))
1604
def postgresql_is_in_recovery():
    """Return True if this PostgreSQL server is still in recovery.

    ``pg_is_in_recovery()`` reports true on a hot standby that is
    replaying WAL, and false on a master or standalone server.
    """
    # Autocommit: this is a read-only status query, no transaction needed.
    cur = db_cursor(autocommit=True)
    cur.execute("SELECT pg_is_in_recovery()")
    return cur.fetchone()[0]
1610
def postgresql_is_in_backup_mode():
    """Return True if a base backup is in progress on this cluster.

    ``pg_start_backup()`` drops a ``backup_label`` file in the cluster
    data directory and ``pg_stop_backup()`` removes it, so the file's
    presence indicates an in-progress (or aborted) base backup.
    """
    return os.path.exists(
        os.path.join(postgresql_cluster_dir, 'backup_label'))
1615
def wait_for_db(timeout=120, db='template1', user='postgres', host=None):
    '''Block until the database accepts connections.

    Relies on db_cursor() retrying the connection for up to `timeout`
    seconds; raises if the database is still unreachable after that.
    The cursor is discarded -- only the successful connection matters.
    '''
    db_cursor(db=db, user=user, host=host, timeout=timeout)
916
1620
def update_nrpe_checks():
917
1621
config_data = config_get()
919
1623
nagios_uid = getpwnam('nagios').pw_uid
920
1624
nagios_gid = getgrnam('nagios').gr_gid
922
subprocess.call(['juju-log', "Nagios user not set up. Exiting."])
1626
juju_log(MSG_DEBUG, "Nagios user not set up. Exiting.")
925
1629
unit_name = os.environ['JUJU_UNIT_NAME'].replace('/', '-')