~ubuntu-branches/ubuntu/precise/haproxy/precise-security

« back to all changes in this revision

Viewing changes to src/stream_sock.c

Committer: Bazaar Package Importer
Author(s): Arnaud Cornet
Date: 2009-06-26 00:11:01 UTC
mfrom: (1.1.6 upstream) (2.1.4 squeeze)
Revision ID: james.westby@ubuntu.com-20090626001101-qo261ke2mjh3d8cn

Tags: 1.3.18-1

http://bugs.debian.org/534583

* New Upstream Version (Closes: #534583).
* Add contrib directory in docs

files added:
contrib/halog

contrib/halog/Makefile

contrib/halog/fgets2-64.c

contrib/halog/fgets2.c

contrib/halog/halog.c

contrib/selinux

contrib/selinux/README

contrib/selinux/haproxy.fc

contrib/selinux/haproxy.if

contrib/selinux/haproxy.te

doc/acl.fig

doc/internals/stream-sock-states.fig

doc/queuing.fig

include/common/ticks.h

include/proto/freq_ctr.h

include/proto/pipe.h

include/proto/signal.h

include/proto/stream_interface.h

include/types/freq_ctr.h

include/types/pipe.h

include/types/proto_tcp.h

include/types/signal.h

include/types/stream_interface.h

src/freq_ctr.c

src/pipe.c

src/signal.c

src/stream_interface.c

tests/test-acl.cfg

tests/test-fsm.cfg

tests/test-inspect-smtp.cfg

tests/test-inspect-ssl.cfg

tests/test-redirect.cfg

tests/test-sched.cfg

files removed:
examples/haproxy-small.spec

include/import/bitops.h

include/import/tree.h

include/proto/senddata.h

include/types/client.h

include/types/polling.h

src/senddata.c

files modified:
CHANGELOG

Makefile

Makefile.bsd

Makefile.osx

README

VERDATE

VERSION

debian/changelog

debian/haproxy.docs

doc/architecture.txt

doc/configuration.txt

examples/haproxy.init

examples/haproxy.spec

include/common/appsession.h

include/common/cfgparse.h

include/common/debug.h

include/common/defaults.h

include/common/eb32tree.h

include/common/eb64tree.h

include/common/ebpttree.h

include/common/ebtree.h

include/common/memory.h

include/common/mini-clist.h

include/common/standard.h

include/common/time.h

include/common/uri_auth.h

include/proto/acl.h

include/proto/backend.h

include/proto/buffers.h

include/proto/checks.h

include/proto/client.h

include/proto/dumpstats.h

include/proto/fd.h

include/proto/log.h

include/proto/proto_http.h

include/proto/proto_tcp.h

include/proto/proto_uxst.h

include/proto/proxy.h

include/proto/server.h

include/proto/session.h

include/proto/stream_sock.h

include/proto/task.h

include/types/acl.h

include/types/backend.h

include/types/buffers.h

include/types/fd.h

include/types/global.h

include/types/proto_http.h

include/types/protocols.h

include/types/proxy.h

include/types/server.h

include/types/session.h

include/types/task.h

src/acl.c

src/appsession.c

src/backend.c

src/buffers.c

src/cfgparse.c

src/checks.c

src/client.c

src/cttproxy.c

src/dumpstats.c

src/eb32tree.c

src/ev_epoll.c

src/ev_kqueue.c

src/ev_poll.c

src/ev_select.c

src/ev_sepoll.c

src/fd.c

src/haproxy.c

src/hdr_idx.c

src/log.c

src/memory.c

src/proto_http.c

src/proto_tcp.c

src/proto_uxst.c

src/protocols.c

src/proxy.c

src/queue.c

src/server.c

src/session.c

src/sessionhash.c

src/standard.c

src/stream_sock.c

src/task.c

src/time.c

src/uri_auth.c

Show diffs side-by-side

added added

removed removed

src/stream_sock.c

* Functions operating on SOCK_STREAM and buffers.

* This program is free software; you can redistribute it and/or

* modify it under the terms of the GNU General Public License

#define _GNU_SOURCE

#include <errno.h>

#include <fcntl.h>

#include <stdio.h>

#include <common/config.h>

#include <common/debug.h>

#include <common/standard.h>

#include <common/ticks.h>

#include <common/time.h>

#include <types/buffers.h>

#include <types/global.h>

#include <types/polling.h>

#include <proto/buffers.h>

#include <proto/client.h>

#include <proto/fd.h>

#include <proto/pipe.h>

#include <proto/stream_sock.h>

#include <proto/task.h>

#include <types/global.h>

/* On recent Linux kernels, the splice() syscall may be used for faster data copy.

* But it's not always defined on some OS versions, and it even happens that some

* definitions are wrong with some glibc due to an offset bug in syscall().

#if defined(CONFIG_HAP_LINUX_SPLICE)

#include <unistd.h>

#include <sys/syscall.h>

#ifndef SPLICE_F_MOVE

#define SPLICE_F_MOVE 0x1

#endif

#ifndef SPLICE_F_NONBLOCK

#define SPLICE_F_NONBLOCK 0x2

#endif

#ifndef SPLICE_F_MORE

#define SPLICE_F_MORE 0x4

#endif

#ifndef __NR_splice

#if defined(__powerpc__) || defined(__powerpc64__)

#define __NR_splice 283

#elif defined(__sparc__) || defined(__sparc64__)

#define __NR_splice 232

#elif defined(__x86_64__)

#define __NR_splice 275

#elif defined(__alpha__)

#define __NR_splice 468

#elif defined (__i386__)

#define __NR_splice 313

#else

#warning unsupported architecture, guessing __NR_splice=313 like x86...

#define __NR_splice 313

#endif /* $arch */

_syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags)

#endif /* __NR_splice */

/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes
 * because of timestamps. Use this as a hint for not looping on splice().
 * The expansion is parenthesized so that the macro behaves as a single operand
 * whatever operator surrounds it (division, modulo, ...).
 */
#define SPLICE_FULL_HINT (16*1448)

/* Returns :

* -1 if splice is not possible or not possible anymore and we must switch to

* user-land copy (eg: to_forward reached)

* 0 when we know that polling is required to get more data (EAGAIN)

* 1 for all other cases (we can safely try again, or if an activity has been

* detected (DATA/NULL/ERR))

* Sets :

* BF_READ_NULL

* BF_READ_PARTIAL

* BF_WRITE_PARTIAL (during copy)

* BF_EMPTY (during copy)

* SI_FL_ERR

* SI_FL_WAIT_ROOM

* (SI_FL_WAIT_RECV)

100

* This function automatically allocates a pipe from the pipe pool. It also

101

* carefully ensures to clear b->pipe whenever it leaves the pipe empty.

102

103

static int stream_sock_splice_in(struct buffer *b, struct stream_interface *si)

104

{

105

int fd = si->fd;

106

int ret, max, total = 0;

107

int retval = 1;

108

109

if (!b->to_forward)

110

return -1;

111

112

if (!(b->flags & BF_KERN_SPLICING))

113

return -1;

114

115

if (b->l) {

116

/* We're embarrassed, there are already data pending in

117

* the buffer and we don't want to have them at two

118

* locations at a time. Let's indicate we need some

119

* place and ask the consumer to hurry.

120

121

si->flags |= SI_FL_WAIT_ROOM;

122

EV_FD_CLR(fd, DIR_RD);

123

b->rex = TICK_ETERNITY;

124

b->cons->chk_snd(b->cons);

125

return 1;

126

}

127

128

if (unlikely(b->pipe == NULL)) {

129

if (pipes_used >= global.maxpipes || !(b->pipe = get_pipe())) {

130

b->flags &= ~BF_KERN_SPLICING;

131

return -1;

132

}

133

}

134

135

/* At this point, b->pipe is valid */

136

137

while (1) {

138

max = b->to_forward;

139

if (max <= 0) {

140

/* It looks like the buffer + the pipe already contain

141

* the maximum amount of data to be transferred. Try to

142

* send those data immediately on the other side if it

143

* is currently waiting.

144

145

retval = -1; /* end of forwarding */

146

break;

147

}

148

149

ret = splice(fd, NULL, b->pipe->prod, NULL, max,

150

SPLICE_F_MOVE|SPLICE_F_NONBLOCK);

151

152

if (ret <= 0) {

153

if (ret == 0) {

154

/* connection closed. This is only detected by

155

* recent kernels (>= 2.6.27.13).

156

157

b->flags |= BF_READ_NULL;

158

retval = 1; /* no need for further polling */

159

break;

160

}

161

162

if (errno == EAGAIN) {

163

/* there are two reasons for EAGAIN :

164

* - nothing in the socket buffer (standard)

165

* - pipe is full

166

* - the connection is closed (kernel < 2.6.27.13)

167

* Since we don't know if pipe is full, we'll

168

* stop if the pipe is not empty. Anyway, we

169

* will almost always fill/empty the pipe.

170

171

172

if (b->pipe->data) {

173

si->flags |= SI_FL_WAIT_ROOM;

174

retval = 1;

175

break;

176

}

177

178

/* We don't know if the connection was closed.

179

* But if we're called upon POLLIN with an empty

180

* pipe and get EAGAIN, it is suspect enought to

181

* try to fall back to the normal recv scheme

182

* which will be able to deal with the situation.

183

184

retval = -1;

185

break;

186

}

187

/* here we have another error */

188

si->flags |= SI_FL_ERR;

189

retval = 1;

190

break;

191

} /* ret <= 0 */

192

193

b->to_forward -= ret;

194

total += ret;

195

b->total += ret;

196

b->pipe->data += ret;

197

b->flags |= BF_READ_PARTIAL;

198

b->flags &= ~BF_EMPTY; /* to prevent shutdowns */

199

200

if (b->pipe->data >= SPLICE_FULL_HINT ||

201

ret >= global.tune.recv_enough) {

202

/* We've read enough of it for this time. */

203

retval = 1;

204

break;

205

}

206

} /* while */

207

208

if (unlikely(!b->pipe->data)) {

209

put_pipe(b->pipe);

210

b->pipe = NULL;

211

}

212

213

return retval;

214

}

215

216

#endif /* CONFIG_HAP_LINUX_SPLICE */

217

218

219

220

* this function is called on a read event from a stream socket.

223

* otherwise.

224

225

int stream_sock_read(int fd) {

__label__ out_eternity, out_wakeup, out_shutdown_r, out_error;

struct buffer *b = fdtab[fd].cb[DIR_RD].b;

int ret, max, retval;

226

struct stream_interface *si = fdtab[fd].owner;

227

struct buffer *b = si->ib;

228

int ret, max, retval, cur_read;

229

int read_poll = MAX_READ_POLL_LOOPS;

230

231

#ifdef DEBUG_FULL

242

if ((fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP)) == FD_POLL_HUP)

243

goto out_shutdown_r;

244

245

/* maybe we were called immediately after an asynchronous shutr */

246

if (b->flags & BF_SHUTR)

247

goto out_wakeup;

248

249

#if defined(CONFIG_HAP_LINUX_SPLICE)

250

if (b->to_forward && b->flags & BF_KERN_SPLICING) {

251

252

/* Under Linux, if FD_POLL_HUP is set, we have reached the end.

253

* Since older splice() implementations were buggy and returned

254

* EAGAIN on end of read, let's bypass the call to splice() now.

255

256

if (fdtab[fd].ev & FD_POLL_HUP)

257

goto out_shutdown_r;

258

259

retval = stream_sock_splice_in(b, si);

260

261

if (retval >= 0) {

262

if (si->flags & SI_FL_ERR)

263

goto out_error;

264

if (b->flags & BF_READ_NULL)

265

goto out_shutdown_r;

266

goto out_wakeup;

267

}

268

/* splice not possible (anymore), let's go on on standard copy */

269

}

270

#endif

271

cur_read = 0;

272

while (1) {

273

274

* 1. compute the maximum block size we can read at once.

275

if (b->l == 0) { /* let's realign the buffer to optimize I/O */

b->r = b->w = b->lr = b->data;

max = b->rlim - b->data;

276

if (b->l == 0) {

277

/* let's realign the buffer to optimize I/O */

278

b->r = b->w = b->lr = b->data;

279

max = b->max_len;

280

}

281

else if (b->r > b->w) {

max = b->rlim - b->r;

282

max = b->data + b->max_len - b->r;

283

}

284

else {

285

max = b->w - b->r;

/* FIXME: theoretically, if w>0, we shouldn't have rlim < data+size anymore

* since it means that the rewrite protection has been removed. This

* implies that the if statement can be removed.

if (max > b->rlim - b->data)

max = b->rlim - b->data;

286

if (max > b->max_len)

287

max = b->max_len;

288

}

if (unlikely(max == 0)) {

/* No more room to store data. This should theoretically

* never happen, but better safe than sorry !

EV_FD_CLR(fd, DIR_RD);

goto out_eternity;

289

290

if (max == 0) {

291

b->flags |= BF_FULL;

292

si->flags |= SI_FL_WAIT_ROOM;

293

break;

294

}

295

296

110

313

if (ret > 0) {

111

314

b->r += ret;

112

315

b->l += ret;

113

b->flags |= BF_PARTIAL_READ;

114

316

cur_read += ret;

317

318

/* if we're allowed to directly forward data, we must update send_max */

319

if (b->to_forward > 0) {

320

int fwd = MIN(b->to_forward, ret);

321

b->send_max += fwd;

322

b->to_forward -= fwd;

323

}

324

325

if (fdtab[fd].state == FD_STCONN)

326

fdtab[fd].state = FD_STREADY;

327

328

b->flags |= BF_READ_PARTIAL;

329

b->flags &= ~BF_EMPTY;

330

115

331

if (b->r == b->data + BUFSIZE) {

116

332

b->r = b->data; /* wrap around the buffer */

117

333

}

118

334

119

335

b->total += ret;

120

336

121

if (b->l == b->rlim - b->data) {

337

if (b->l >= b->max_len) {

122

338

/* The buffer is now full, there's no point in going through

123

339

* the loop again.

124

340

125

EV_FD_CLR(fd, DIR_RD);

126

goto out_eternity;

341

if (!(b->flags & BF_STREAMER_FAST) && (cur_read == b->l)) {

342

b->xfer_small = 0;

343

b->xfer_large++;

344

if (b->xfer_large >= 3) {

345

/* we call this buffer a fast streamer if it manages

346

* to be filled in one call 3 consecutive times.

347

348

b->flags |= (BF_STREAMER | BF_STREAMER_FAST);

349

//fputc('+', stderr);

350

}

351

}

352

else if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&

353

(cur_read <= BUFSIZE / 2)) {

354

b->xfer_large = 0;

355

b->xfer_small++;

356

if (b->xfer_small >= 2) {

357

/* if the buffer has been at least half full twice,

358

* we receive faster than we send, so at least it

359

* is not a "fast streamer".

360

361

b->flags &= ~BF_STREAMER_FAST;

362

//fputc('-', stderr);

363

}

364

}

365

else {

366

b->xfer_small = 0;

367

b->xfer_large = 0;

368

}

369

370

b->flags |= BF_FULL;

371

si->flags |= SI_FL_WAIT_ROOM;

372

break;

127

373

}

128

374

129

375

/* if too many bytes were missing from last read, it means that

133

379

* is closed.

134

380

135

381

if (ret < max) {

382

if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&

383

(cur_read <= BUFSIZE / 2)) {

384

b->xfer_large = 0;

385

b->xfer_small++;

386

if (b->xfer_small >= 3) {

387

/* we have read less than half of the buffer in

388

* one pass, and this happened at least 3 times.

389

* This is definitely not a streamer.

390

391

b->flags &= ~(BF_STREAMER | BF_STREAMER_FAST);

392

//fputc('!', stderr);

393

}

394

}

395

/* unfortunately, on level-triggered events, POLL_HUP

396

* is generally delivered AFTER the system buffer is

397

* empty, so this one might never match.

398

136

399

if (fdtab[fd].ev & FD_POLL_HUP)

137

400

goto out_shutdown_r;

138

break;

401

402

/* if a streamer has read few data, it may be because we

403

* have exhausted system buffers. It's not worth trying

404

* again.

405

406

if (b->flags & BF_STREAMER)

407

break;

408

409

/* generally if we read something smaller than 1 or 2 MSS,

410

* it means that either we have exhausted the system's

411

* buffers (streamer or question-response protocol) or

412

* that the connection will be closed. Streamers are

413

* easily detected so we return early. For other cases,

414

* it's still better to perform a last read to be sure,

415

* because it may save one complete poll/read/wakeup cycle

416

* in case of shutdown.

417

418

if (ret < MIN_RET_FOR_READ_LOOP && b->flags & BF_STREAMER)

419

break;

420

421

/* if we read a large block smaller than what we requested,

422

* it's almost certain we'll never get anything more.

423

424

if (ret >= global.tune.recv_enough)

425

break;

139

426

}

140

427

141

/* generally if we read something smaller than 1 or 2 MSS,

142

* it means that it's not worth trying to read again. It may

143

* also happen on headers, but the application then can stop

144

* reading before we start polling.

145

146

if (ret < MIN_RET_FOR_READ_LOOP)

147

break;

148

149

if (--read_poll <= 0)

150

break;

151

428

if ((b->flags & BF_READ_DONTWAIT) || --read_poll <= 0)

429

break;

152

430

}

153

431

else if (ret == 0) {

154

432

/* connection closed */

156

434

}

157

435

else if (errno == EAGAIN) {

158

436

/* Ignore EAGAIN but inform the poller that there is

159

* nothing to read left. But we may have done some work

160

* justifying to notify the task.

437

* nothing to read left if we did not read much, ie

438

* less than what we were still expecting to read.

439

* But we may have done some work justifying to notify

440

* the task.

161

441

162

retval = 0;

442

if (cur_read < MIN_RET_FOR_READ_LOOP)

443

retval = 0;

163

444

break;

164

445

}

165

446

else {

167

448

}

168

449

} /* while (1) */

169

450

170

171

* The only way to get out of this loop is to have stopped reading

172

* without any error nor close, either by limiting the number of

173

* loops, or because of an EAGAIN. We only rearm the timer if we

174

* have at least read something.

451

out_wakeup:

452

/* We might have some data the consumer is waiting for */

453

if ((b->send_max || b->pipe) && (b->cons->flags & SI_FL_WAIT_DATA)) {

454

int last_len = b->pipe ? b->pipe->data : 0;

455

456

b->cons->chk_snd(b->cons);

457

458

/* check if the consumer has freed some space */

459

if (!(b->flags & BF_FULL) &&

460

(!last_len || !b->pipe || b->pipe->data < last_len))

461

si->flags &= ~SI_FL_WAIT_ROOM;

462

}

463

464

if (si->flags & SI_FL_WAIT_ROOM) {

465

EV_FD_CLR(fd, DIR_RD);

466

b->rex = TICK_ETERNITY;

467

}

468

else if ((b->flags & (BF_SHUTR|BF_READ_PARTIAL|BF_FULL|BF_READ_NOEXP)) == BF_READ_PARTIAL)

469

b->rex = tick_add_ifset(now_ms, b->rto);

470

471

/* we have to wake up if there is a special event or if we don't have

472

* any more data to forward.

175

473

176

177

if (b->flags & BF_PARTIAL_READ && tv_isset(&b->rex)) {

178

if (tv_add_ifset(&b->rex, &now, &b->rto))

179

goto out_wakeup;

180

out_eternity:

181

tv_eternity(&b->rex);

182

}

183

184

out_wakeup:

185

if (b->flags & BF_READ_STATUS)

186

task_wakeup(fdtab[fd].owner);

474

if ((b->flags & (BF_READ_NULL|BF_READ_ERROR|BF_SHUTR|BF_READ_DONTWAIT)) ||

475

!b->to_forward ||

476

si->state != SI_ST_EST ||

477

b->cons->state != SI_ST_EST ||

478

(si->flags & SI_FL_ERR))

479

task_wakeup(si->owner, TASK_WOKEN_IO);

480

481

b->flags &= ~BF_READ_DONTWAIT;

187

482

fdtab[fd].ev &= ~FD_POLL_IN;

188

483

return retval;

189

484

190

485

out_shutdown_r:

486

/* we received a shutdown */

191

487

fdtab[fd].ev &= ~FD_POLL_HUP;

192

488

b->flags |= BF_READ_NULL;

193

goto out_eternity;

489

stream_sock_shutr(si);

490

goto out_wakeup;

194

491

195

492

out_error:

196

/* There was an error. we must wakeup the task. No need to clear

197

* the events, the task will do it.

493

/* Read error on the file descriptor. We mark the FD as STERROR so

494

* that we don't use it anymore. The error is reported to the stream

495

* interface which will take proper action. We must not perturb the

496

* buffer because the stream interface wants to ensure transparent

497

* connection retries.

198

498

499

199

500

fdtab[fd].state = FD_STERROR;

200

501

fdtab[fd].ev &= ~FD_POLL_STICKY;

201

b->flags |= BF_READ_ERROR;

202

goto out_eternity;

502

EV_FD_REM(fd);

503

si->flags |= SI_FL_ERR;

504

retval = 1;

505

goto out_wakeup;

203

506

}

204

507

205

508

206

509

207

* this function is called on a write event from a stream socket.

208

* It returns 0 if we have a high confidence that we will not be

209

* able to write more data without polling first. Returns non-zero

210

* otherwise.

510

* This function is called to send buffer data to a stream socket.

511

* It returns -1 in case of unrecoverable error, 0 if the caller needs to poll

512

* before calling it again, otherwise 1. If a pipe was associated with the

513

* buffer and it empties it, it releases it as well.

211

514

212

int stream_sock_write(int fd) {

213

__label__ out_eternity, out_wakeup, out_error;

214

struct buffer *b = fdtab[fd].cb[DIR_WR].b;

215

int ret, max, retval;

515

static int stream_sock_write_loop(struct stream_interface *si, struct buffer *b)

516

{

216

517

int write_poll = MAX_WRITE_POLL_LOOPS;

217

218

#ifdef DEBUG_FULL

219

fprintf(stderr,"stream_sock_write : fd=%d, owner=%p\n", fd, fdtab[fd].owner);

518

int retval = 1;

519

int ret, max;

520

521

#if defined(CONFIG_HAP_LINUX_SPLICE)

522

while (b->pipe) {

523

ret = splice(b->pipe->cons, NULL, si->fd, NULL, b->pipe->data,

524

SPLICE_F_MOVE|SPLICE_F_NONBLOCK);

525

if (ret <= 0) {

526

if (ret == 0 || errno == EAGAIN) {

527

retval = 0;

528

return retval;

529

}

530

/* here we have another error */

531

retval = -1;

532

return retval;

533

}

534

535

b->flags |= BF_WRITE_PARTIAL;

536

b->pipe->data -= ret;

537

538

if (!b->pipe->data) {

539

put_pipe(b->pipe);

540

b->pipe = NULL;

541

break;

542

}

543

544

if (--write_poll <= 0)

545

return retval;

546

}

547

548

/* At this point, the pipe is empty, but we may still have data pending

549

* in the normal buffer.

550

551

if (!b->l) {

552

b->flags |= BF_EMPTY;

553

return retval;

554

}

220

555

#endif

221

222

retval = 1;

223

if (fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))

224

goto out_error;

225

556

if (!b->send_max)

557

return retval;

558

559

/* when we're in this loop, we already know that there is no spliced

560

* data left, and that there are sendable buffered data.

561

226

562

while (1) {

227

if (b->l == 0) { /* let's realign the buffer to optimize I/O */

228

b->r = b->w = b->lr = b->data;

229

max = 0;

230

}

231

else if (b->r > b->w) {

563

if (b->r > b->w)

232

564

max = b->r - b->w;

233

}

234

else {

565

else

235

566

max = b->data + BUFSIZE - b->w;

236

}

237

238

if (max == 0) {

239

/* may be we have received a connection acknowledgement in TCP mode without data */

240

if (likely(fdtab[fd].state == FD_STCONN)) {

241

/* We have no data to send to check the connection, and

242

* getsockopt() will not inform us whether the connection

243

* is still pending. So we'll reuse connect() to check the

244

* state of the socket. This has the advantage of giving us

245

* the following info :

246

* - error

247

* - connecting (EALREADY, EINPROGRESS)

248

* - connected (EISCONN, 0)

249

250

if ((connect(fd, fdtab[fd].peeraddr, fdtab[fd].peerlen) == 0))

251

errno = 0;

252

253

if (errno == EALREADY || errno == EINPROGRESS) {

254

retval = 0;

255

goto out_wakeup;

256

}

257

258

if (errno && errno != EISCONN)

259

goto out_error;

260

261

/* OK we just need to indicate that we got a connection

262

* and that we wrote nothing.

263

264

b->flags |= BF_WRITE_NULL;

265

fdtab[fd].state = FD_STREADY;

266

}

267

268

/* Funny, we were called to write something but there wasn't

269

* anything. Theoretically we cannot get there, but just in case,

270

* let's disable the write event and pretend we never came there.

271

272

EV_FD_CLR(fd, DIR_WR);

273

goto out_eternity;

274

}

567

568

/* limit the amount of outgoing data if required */

569

if (max > b->send_max)

570

max = b->send_max;

275

571

276

572

#ifndef MSG_NOSIGNAL

277

573

{

278

574

int skerr;

279

575

socklen_t lskerr = sizeof(skerr);

280

576

281

ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);

577

ret = getsockopt(si->fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);

282

578

if (ret == -1 || skerr)

283

579

ret = -1;

284

580

else

285

ret = send(fd, b->w, max, MSG_DONTWAIT);

581

ret = send(si->fd, b->w, max, MSG_DONTWAIT);

286

582

}

287

583

#else

288

ret = send(fd, b->w, max, MSG_DONTWAIT | MSG_NOSIGNAL);

584

ret = send(si->fd, b->w, max, MSG_DONTWAIT | MSG_NOSIGNAL);

289

585

#endif

290

586

291

587

if (ret > 0) {

292

b->l -= ret;

588

if (fdtab[si->fd].state == FD_STCONN)

589

fdtab[si->fd].state = FD_STREADY;

590

591

b->flags |= BF_WRITE_PARTIAL;

592

293

593

b->w += ret;

294

295

b->flags |= BF_PARTIAL_WRITE;

296

297

if (b->w == b->data + BUFSIZE) {

594

if (b->w == b->data + BUFSIZE)

298

595

b->w = b->data; /* wrap around the buffer */

299

}

300

301

if (!b->l) {

302

EV_FD_CLR(fd, DIR_WR);

303

goto out_eternity;

304

}

596

597

b->l -= ret;

598

if (likely(b->l < b->max_len))

599

b->flags &= ~BF_FULL;

600

601

if (likely(!b->l)) {

602

/* optimize data alignment in the buffer */

603

b->r = b->w = b->lr = b->data;

604

if (likely(!b->pipe))

605

b->flags |= BF_EMPTY;

606

}

607

608

b->send_max -= ret;

609

if (!b->send_max || !b->l)

610

break;

305

611

306

612

/* if the system buffer is full, don't insist */

307

613

if (ret < max)

311

617

break;

312

618

}

313

619

else if (ret == 0 || errno == EAGAIN) {

314

/* nothing written, just pretend we were never called

315

* and wait for the socket to be ready. But we may have

316

* done some work justifying to notify the task.

317

620

/* nothing written, we need to poll for write first */

318

621

retval = 0;

319

622

break;

320

623

}

321

624

else {

625

/* bad, we got an error */

626

retval = -1;

627

break;

628

}

629

} /* while (1) */

630

631

return retval;

632

}

633

634

635

636

* This function is called on a write event from a stream socket.

637

* It returns 0 if the caller needs to poll before calling it again, otherwise

638

* non-zero.

639

640

int stream_sock_write(int fd)

641

{

642

struct stream_interface *si = fdtab[fd].owner;

643

struct buffer *b = si->ob;

644

int retval = 1;

645

646

#ifdef DEBUG_FULL

647

fprintf(stderr,"stream_sock_write : fd=%d, owner=%p\n", fd, fdtab[fd].owner);

648

#endif

649

650

retval = 1;

651

if (fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))

652

goto out_error;

653

654

/* we might have been called just after an asynchronous shutw */

655

if (b->flags & BF_SHUTW)

656

goto out_wakeup;

657

658

if (likely(!(b->flags & BF_EMPTY))) {

659

/* OK there are data waiting to be sent */

660

retval = stream_sock_write_loop(si, b);

661

if (retval < 0)

322

662

goto out_error;

663

}

664

else {

665

/* may be we have received a connection acknowledgement in TCP mode without data */

666

if (likely(fdtab[fd].state == FD_STCONN)) {

667

/* We have no data to send to check the connection, and

668

* getsockopt() will not inform us whether the connection

669

* is still pending. So we'll reuse connect() to check the

670

* state of the socket. This has the advantage of giving us

671

* the following info :

672

* - error

673

* - connecting (EALREADY, EINPROGRESS)

674

* - connected (EISCONN, 0)

675

676

if ((connect(fd, fdtab[fd].peeraddr, fdtab[fd].peerlen) == 0))

677

errno = 0;

678

679

if (errno == EALREADY || errno == EINPROGRESS) {

680

retval = 0;

681

goto out_may_wakeup;

682

}

683

684

if (errno && errno != EISCONN)

685

goto out_error;

686

687

/* OK we just need to indicate that we got a connection

688

* and that we wrote nothing.

689

690

b->flags |= BF_WRITE_NULL;

691

fdtab[fd].state = FD_STREADY;

323

692

}

324

} /* while (1) */

325

326

327

* The only way to get out of this loop is to have stopped writing

328

* without any error, either by limiting the number of loops, or

329

* because of an EAGAIN. We only rearm the timer if we have at least

330

* written something.

331

332

333

if (b->flags & BF_PARTIAL_WRITE && tv_isset(&b->wex)) {

334

if (tv_add_ifset(&b->wex, &now, &b->wto)) {

335

/* FIXME: to prevent the client from expiring read timeouts during writes,

336

* we refresh it. A solution would be to merge read+write timeouts into a

337

* unique one, although that needs some study particularly on full-duplex

338

* TCP connections. */

339

if (!(b->flags & BF_SHUTR_STATUS) && tv_isset(&b->rex))

340

b->rex = b->wex;

693

694

/* Funny, we were called to write something but there wasn't

695

* anything. We can get there, for example if we were woken up

696

* on a write event to finish the splice, but the send_max is 0

697

* so we cannot write anything from the buffer. Let's disable

698

* the write event and pretend we never came there.

699

700

}

701

702

if (!b->pipe && !b->send_max) {

703

/* the connection is established but we can't write. Either the

704

* buffer is empty, or we just refrain from sending because the

705

* send_max limit was reached. Maybe we just wrote the last

706

* chunk and need to close.

707

708

if (((b->flags & (BF_SHUTW|BF_EMPTY|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) ==

709

(BF_EMPTY|BF_WRITE_ENA|BF_SHUTR)) &&

710

(si->state == SI_ST_EST)) {

711

stream_sock_shutw(si);

341

712

goto out_wakeup;

342

713

}

343

out_eternity:

344

tv_eternity(&b->wex);

345

}

346

347

out_wakeup:

348

if (b->flags & BF_WRITE_STATUS)

349

task_wakeup(fdtab[fd].owner);

714

715

if ((b->flags & (BF_EMPTY|BF_SHUTW)) == BF_EMPTY)

716

si->flags |= SI_FL_WAIT_DATA;

717

718

EV_FD_CLR(fd, DIR_WR);

719

b->wex = TICK_ETERNITY;

720

}

721

722

out_may_wakeup:

723

if (b->flags & BF_WRITE_ACTIVITY) {

724

/* update timeout if we have written something */

725

if ((b->send_max || b->pipe) &&

726

(b->flags & (BF_SHUTW|BF_WRITE_PARTIAL)) == BF_WRITE_PARTIAL)

727

b->wex = tick_add_ifset(now_ms, b->wto);

728

729

out_wakeup:

730

if (tick_isset(si->ib->rex)) {

731

/* Note: to prevent the client from expiring read timeouts

732

* during writes, we refresh it. A better solution would be

733

* to merge read+write timeouts into a unique one, although

734

* that needs some study particularly on full-duplex TCP

735

* connections.

736

737

si->ib->rex = tick_add_ifset(now_ms, si->ib->rto);

738

}

739

740

/* the producer might be waiting for more room to store data */

741

if (likely((b->flags & (BF_SHUTW|BF_WRITE_PARTIAL|BF_FULL)) == BF_WRITE_PARTIAL &&

742

(b->prod->flags & SI_FL_WAIT_ROOM)))

743

b->prod->chk_rcv(b->prod);

744

745

/* we have to wake up if there is a special event or if we don't have

746

* any more data to forward and it's not planned to send any more.

747

748

if (likely((b->flags & (BF_WRITE_NULL|BF_WRITE_ERROR|BF_SHUTW)) ||

749

(!b->to_forward && !b->send_max && !b->pipe) ||

750

si->state != SI_ST_EST ||

751

b->prod->state != SI_ST_EST))

752

task_wakeup(si->owner, TASK_WOKEN_IO);

753

}

754

350

755

fdtab[fd].ev &= ~FD_POLL_OUT;

351

756

return retval;

352

757

353

758

out_error:

354

/* There was an error. we must wakeup the task. No need to clear

355

* the events, the task will do it.

759

/* Write error on the file descriptor. We mark the FD as STERROR so

760

* that we don't use it anymore. The error is reported to the stream

761

* interface which will take proper action. We must not perturb the

762

* buffer because the stream interface wants to ensure transparent

763

* connection retries.

356

764

765

357

766

fdtab[fd].state = FD_STERROR;

358

767

fdtab[fd].ev &= ~FD_POLL_STICKY;

359

b->flags |= BF_WRITE_ERROR;

360

goto out_eternity;

361

362

363

}

364

768

EV_FD_REM(fd);

769

si->flags |= SI_FL_ERR;

770

task_wakeup(si->owner, TASK_WOKEN_IO);

771

return 1;

772

}

773

774

775

* This function performs a shutdown-write on a stream interface in a connected or

776

* init state (it does nothing for other states). It either shuts the write side

777

* or closes the file descriptor and marks itself as closed. The buffer flags are

778

* updated to reflect the new state.

779

780

void stream_sock_shutw(struct stream_interface *si)

781

{

782

if (si->ob->flags & BF_SHUTW)

783

return;

784

si->ob->flags |= BF_SHUTW;

785

si->ob->wex = TICK_ETERNITY;

786

si->flags &= ~SI_FL_WAIT_DATA;

787

788

switch (si->state) {

789

case SI_ST_EST:

790

if (!(si->ib->flags & BF_SHUTR)) {

791

EV_FD_CLR(si->fd, DIR_WR);

792

shutdown(si->fd, SHUT_WR);

793

return;

794

}

795

/* fall through */

796

case SI_ST_CON:

797

/* we may have to close a pending connection, and mark the

798

* response buffer as shutr

799

800

fd_delete(si->fd);

801

/* fall through */

802

case SI_ST_CER:

803

si->state = SI_ST_DIS;

804

default:

805

si->flags &= ~SI_FL_WAIT_ROOM;

806

si->ib->flags |= BF_SHUTR;

807

si->ib->rex = TICK_ETERNITY;

808

si->exp = TICK_ETERNITY;

809

return;

810

}

811

}

812

813

814

* This function performs a shutdown-read on a stream interface in a connected or

815

* init state (it does nothing for other states). It either shuts the read side

816

* or closes the file descriptor and marks itself as closed. The buffer flags are

817

* updated to reflect the new state.

818

819

void stream_sock_shutr(struct stream_interface *si)

820

{

821

if (si->ib->flags & BF_SHUTR)

822

return;

823

si->ib->flags |= BF_SHUTR;

824

si->ib->rex = TICK_ETERNITY;

825

si->flags &= ~SI_FL_WAIT_ROOM;

826

827

if (si->state != SI_ST_EST && si->state != SI_ST_CON)

828

return;

829

830

if (si->ob->flags & BF_SHUTW) {

831

fd_delete(si->fd);

832

si->state = SI_ST_DIS;

833

si->exp = TICK_ETERNITY;

834

return;

835

}

836

EV_FD_CLR(si->fd, DIR_RD);

837

return;

838

}

839

840

841

* Updates a connected stream_sock file descriptor status and timeouts

842

* according to the buffers' flags. It should only be called once after the

843

* buffer flags have settled down, and before they are cleared. It doesn't

844

* harm to call it as often as desired (it just slightly hurts performance).

845

846

void stream_sock_data_finish(struct stream_interface *si)

847

{

848

struct buffer *ib = si->ib;

849

struct buffer *ob = si->ob;

850

int fd = si->fd;

851

852

DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",

853

now_ms, __FUNCTION__,

854

fd, fdtab[fd].owner,

855

ib, ob,

856

ib->rex, ob->wex,

857

ib->flags, ob->flags,

858

ib->l, ob->l, si->state);

859

860

/* Check if we need to close the read side */

861

if (!(ib->flags & BF_SHUTR)) {

862

/* Read not closed, update FD status and timeout for reads */

863

if (ib->flags & (BF_FULL|BF_HIJACK)) {

864

/* stop reading */

865

if ((ib->flags & (BF_FULL|BF_HIJACK)) == BF_FULL)

866

si->flags |= SI_FL_WAIT_ROOM;

867

EV_FD_COND_C(fd, DIR_RD);

868

ib->rex = TICK_ETERNITY;

869

}

870

else {

871

/* (re)start reading and update timeout. Note: we don't recompute the timeout

872

* everytime we get here, otherwise it would risk never to expire. We only

873

* update it if is was not yet set, or if we already got some read status.

874

875

si->flags &= ~SI_FL_WAIT_ROOM;

876

EV_FD_COND_S(fd, DIR_RD);

877

if (!(ib->flags & BF_READ_NOEXP) &&

878

(!tick_isset(ib->rex) || ib->flags & BF_READ_ACTIVITY))

879

ib->rex = tick_add_ifset(now_ms, ib->rto);

880

}

881

}

882

883

/* Check if we need to close the write side */

884

if (!(ob->flags & BF_SHUTW)) {

885

/* Write not closed, update FD status and timeout for writes */

886

if ((ob->send_max == 0 && !ob->pipe) ||

887

(ob->flags & BF_EMPTY) ||

888

(ob->flags & (BF_HIJACK|BF_WRITE_ENA)) == 0) {

889

/* stop writing */

890

if ((ob->flags & (BF_EMPTY|BF_HIJACK|BF_WRITE_ENA)) == (BF_EMPTY|BF_WRITE_ENA))

891

si->flags |= SI_FL_WAIT_DATA;

892

EV_FD_COND_C(fd, DIR_WR);

893

ob->wex = TICK_ETERNITY;

894

}

895

else {

896

/* (re)start writing and update timeout. Note: we don't recompute the timeout

897

* everytime we get here, otherwise it would risk never to expire. We only

898

* update it if is was not yet set, or if we already got some write status.

899

900

si->flags &= ~SI_FL_WAIT_DATA;

901

EV_FD_COND_S(fd, DIR_WR);

902

if (!tick_isset(ob->wex) || ob->flags & BF_WRITE_ACTIVITY) {

903

ob->wex = tick_add_ifset(now_ms, ob->wto);

904

if (tick_isset(ib->rex)) {

905

/* Note: depending on the protocol, we don't know if we're waiting

906

* for incoming data or not. So in order to prevent the socket from

907

* expiring read timeouts during writes, we refresh the read timeout,

908

* except if it was already infinite.

909

910

ib->rex = tick_add_ifset(now_ms, ib->rto);

911

}

912

}

913

}

914

}

915

}

916

917

/* This function is used for inter-stream-interface calls. It is called by the

918

* consumer to inform the producer side that it may be interested in checking

919

* for free space in the buffer. Note that it intentionally does not update

920

* timeouts, so that we can still check them later at wake-up.

921

922

void stream_sock_chk_rcv(struct stream_interface *si)

923

{

924

struct buffer *ib = si->ib;

925

926

DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",

927

now_ms, __FUNCTION__,

928

si->fd, fdtab[si->fd].owner,

929

ib, si->ob,

930

ib->rex, si->ob->wex,

931

ib->flags, si->ob->flags,

932

ib->l, si->ob->l, si->state);

933

934

if (unlikely(si->state != SI_ST_EST || (ib->flags & BF_SHUTR)))

935

return;

936

937

if (ib->flags & (BF_FULL|BF_HIJACK)) {

938

/* stop reading */

939

if ((ib->flags & (BF_FULL|BF_HIJACK)) == BF_FULL)

940

si->flags |= SI_FL_WAIT_ROOM;

941

EV_FD_COND_C(si->fd, DIR_RD);

942

}

943

else {

944

/* (re)start reading */

945

si->flags &= ~SI_FL_WAIT_ROOM;

946

EV_FD_COND_S(si->fd, DIR_RD);

947

}

948

}

949

950

951

/* This function is used for inter-stream-interface calls. It is called by the

952

* producer to inform the consumer side that it may be interested in checking

953

* for data in the buffer. Note that it intentionally does not update timeouts,

954

* so that we can still check them later at wake-up.

955

956

void stream_sock_chk_snd(struct stream_interface *si)

957

{

958

struct buffer *ob = si->ob;

959

int retval;

960

961

DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",

962

now_ms, __FUNCTION__,

963

si->fd, fdtab[si->fd].owner,

964

si->ib, ob,

965

si->ib->rex, ob->wex,

966

si->ib->flags, ob->flags,

967

si->ib->l, ob->l, si->state);

968

969

if (unlikely(si->state != SI_ST_EST || (ob->flags & BF_SHUTW)))

970

return;

971

972

if (!(si->flags & SI_FL_WAIT_DATA) || /* not waiting for data */

973

(fdtab[si->fd].ev & FD_POLL_OUT) || /* we'll be called anyway */

974

!(ob->send_max || ob->pipe) || /* called with nothing to send ! */

975

!(ob->flags & (BF_HIJACK|BF_WRITE_ENA))) /* we may not write */

976

return;

977

978

retval = stream_sock_write_loop(si, ob);

979

if (retval < 0) {

980

/* Write error on the file descriptor. We mark the FD as STERROR so

981

* that we don't use it anymore and we notify the task.

982

983

fdtab[si->fd].state = FD_STERROR;

984

fdtab[si->fd].ev &= ~FD_POLL_STICKY;

985

EV_FD_REM(si->fd);

986

si->flags |= SI_FL_ERR;

987

goto out_wakeup;

988

}

989

990

if (retval > 0 || (ob->send_max == 0 && !ob->pipe)) {

991

/* the connection is established but we can't write. Either the

992

* buffer is empty, or we just refrain from sending because the

993

* send_max limit was reached. Maybe we just wrote the last

994

* chunk and need to close.

995

996

if (((ob->flags & (BF_SHUTW|BF_EMPTY|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) ==

997

(BF_EMPTY|BF_WRITE_ENA|BF_SHUTR)) &&

998

(si->state == SI_ST_EST)) {

999

stream_sock_shutw(si);

1000

goto out_wakeup;

1001

}

1002

1003

if ((ob->flags & (BF_SHUTW|BF_EMPTY|BF_HIJACK|BF_WRITE_ENA)) == (BF_EMPTY|BF_WRITE_ENA))

1004

si->flags |= SI_FL_WAIT_DATA;

1005

ob->wex = TICK_ETERNITY;

1006

}

1007

else {

1008

/* (re)start writing. */

1009

si->flags &= ~SI_FL_WAIT_DATA;

1010

EV_FD_COND_S(si->fd, DIR_WR);

1011

}

1012

1013

if (likely(ob->flags & BF_WRITE_ACTIVITY)) {

1014

/* update timeout if we have written something */

1015

if ((ob->send_max || ob->pipe) &&

1016

(ob->flags & (BF_SHUTW|BF_WRITE_PARTIAL)) == BF_WRITE_PARTIAL)

1017

ob->wex = tick_add_ifset(now_ms, ob->wto);

1018

1019

if (tick_isset(si->ib->rex)) {

1020

/* Note: to prevent the client from expiring read timeouts

1021

* during writes, we refresh it. A better solution would be

1022

* to merge read+write timeouts into a unique one, although

1023

* that needs some study particularly on full-duplex TCP

1024

* connections.

1025

1026

si->ib->rex = tick_add_ifset(now_ms, si->ib->rto);

1027

}

1028

}

1029

1030

/* in case of special condition (error, shutdown, end of write...), we

1031

* have to notify the task.

1032

1033

if (likely((ob->flags & (BF_WRITE_NULL|BF_WRITE_ERROR|BF_SHUTW)) ||

1034

(!ob->to_forward && !ob->send_max && !ob->pipe) ||

1035

si->state != SI_ST_EST)) {

1036

out_wakeup:

1037

task_wakeup(si->owner, TASK_WOKEN_IO);

1038

}

1039

}

365

1040

366

1041

367

1042

Older »