~ubuntu-branches/ubuntu/gutsy/edbrowse/gutsy

Viewing changes to format.c

Committer: Bazaar Package Importer
Author(s): Kapil Hari Paranjape
Date: 2006-10-20 10:47:30 UTC
mfrom: (1.1.1 upstream)
Revision ID: james.westby@ubuntu.com-20061020104730-o7vxbrypwaz932dt

Tags: 3.1.2-1

http://bugs.debian.org/306486

* New upstream version (3.1.2). Closes: #306486.
  - programs now written in C
  - support for javascript.
* debian/control:
  - added Kapil Hari Paranjape to Uploaders.
  - Build-depends on "libssl-dev", "libmozjs-dev", "libpcre3-dev".
  - Standards-Version to 3.7.2. No changes required.
* debian/rules:
  - add "noopt" feature.
  - set CFLAGS and LIBS.
  - Put $(MAKE) into the build rules.
* debian/copyright: Edited to add the current copyright which
  is GPL with the exception for linking with OpenSSL.
* debian/docs: added "README".
* debian/examples: added "jsrt".

files added:
README

auth.c

buffers.c

cookies.c

eb.h

eb.p

fetchmail.c

format.c

html.c

http.c

jsdom.c

jsloc.c

jsrt

main.c

makefile

makefile.bsd

sendmail.c

ssl-certs

stringfile.c

tcp.c

tcp.h

todo

url.c

files removed:
edbrowse

files modified:
debian/changelog

debian/control

debian/copyright

debian/docs

debian/examples

debian/rules

edbdoc.html

sample.ebrc

Show diffs side-by-side

added added

removed removed

format.c

/* format.c
 * Format text, establish line breaks, manage whitespace.
 * Copyright (c) Karl Dahlke, 2006
 * This file is part of the edbrowse project, released under GPL.
 */

#include "eb.h"

/*********************************************************************
Prepare html for text processing.  The buffer is rewritten in place.
h holds h_len bytes, and must have room for one more,
because the result is null terminated.
Change nulls to spaces.
Apply backspaces: x\b erases x, unless the byte before the backspace
is newline, backspace, or one of < > ' " &.
Drop the byte pair 0xe2 0x80.
Make sure it doesn't already contain my magic code,
the one I use to indicate a tag.
If it does, well, change them to something else.
I can only hope this doesn't screw up some embedded javascript.
Finally, turn each \r\n into \n.
*********************************************************************/

void
prepareForBrowse(char *h, int h_len)
{
    int i, j;

    /* i reads, j writes; j never gets ahead of i */
    for(i = j = 0; i < h_len; ++i) {
        if(h[i] == 0)
            h[i] = ' ';
        if(h[i] == '\b') {
            /* The test on j keeps us from backing up past the start
             * of the buffer, when the bytes before \b were dropped. */
            if(i && j > 0 && !strchr("\n\b<>'\"&", h[i - 1]))
                --j;
            continue;
        }
        if(h[i] == (char)0xe2 && i < h_len - 1 && h[i + 1] == (char)0x80) {
            ++i;        /* skip both bytes */
            continue;
        }
        if(h[i] == InternalCodeChar)
            h[i] = InternalCodeCharAlternate;
        h[j++] = h[i];
    }
    h[j] = 0;            /* now it's a string */

    /* dos to unix: drop the \r of each \r\n pair */
    for(i = j = 0; h[i]; ++i) {
        if(h[i] == '\r' && h[i + 1] == '\n')
            continue;
        h[j++] = h[i];
    }
    h[j] = 0;
}                /* prepareForBrowse */

/*********************************************************************
Skip past an html comment, or a similar construct that is not rendered.
h points to the start of the construct; its first two characters
are skipped without being examined, apart from the php test.
Three forms are handled.
If h[2] and h[3] are dashes, as in <!-- ... -->, the comment ends at
a pair of dashes, followed by any further dashes and whitespace, then >.
If ?php follows the first character, the span ends at ?>.
Anything else ends at the first >.
Returns a pointer to the first character past the construct,
or to the terminating null byte if it is never closed.
If lines is not null, *lines receives the number of newlines skipped.
*********************************************************************/

const char *
skipHtmlComment(const char *h, int *lines)
{
    int lns = 0;
    bool comm = h[2] == '-' && h[3] == '-';
    bool php = memEqualCI(h + 1, "?php", 4);

    h += comm ? 4 : 2;
    while(*h) {
        if(php) {        /* special type of comment */
            if(*h == '?' && h[1] == '>') {
                h += 2;
                goto done;
            }
            /* newlines inside php count too */
            if(*h == '\n')
                ++lns;
            ++h;
            continue;
        }

        if(!comm && *h == '>') {
            ++h;
            goto done;
        }

        if(comm && h[0] == '-' && h[1] == '-') {
            h += 2;
            while(*h == '-')
                h++;
            while(isspaceByte(*h)) {
                if(*h == '\n')
                    ++lns;
                h++;
            }
            if(!*h)
                goto done;
            if(*h == '>') {
                ++h;
                goto done;
            }
            /* dashes in the middle of the comment, keep going */
            continue;
        }

        if(*h == '\n')
            ++lns;
        h++;
    }

  done:
    if(lines)
        *lines = lns;
    return h;
}                /* skipHtmlComment */

108

109

/* Is c an attribute character?
 * That is anything above space, other than = < or >.
 * c is compared as a plain char, so where char is signed,
 * bytes with the high bit set are not attribute characters. */
static bool
atchr(char c)
{
    if(c <= ' ')
        return false;
    return c != '=' && c != '<' && c != '>';
}                /* atchr */

115

116

/*********************************************************************
Parse an html tag.
e is pointer to the beginning of the element (*e must be '<').
The text at e must be null terminated.
Result parameters, each of which may be null if not wanted:
name points to the tag name, and namelen receives its length;
   the name includes the leading slash of a closing tag,
   and namelen is only set when name is also supplied.
attr points to the first attribute, or to the closing > or <
   if there are no attributes.
end points to the first character past the html tag;
   if the tag is cut short by another <, end points to that <.
lines records the number of newlines consumed by the tag.
Returns false if this does not parse as a tag,
in which case name and attr may already have been set.
The text is modified in one case: when a quoted value is followed by
an extra quote and then >, the extra quote is replaced with a space.
*********************************************************************/

bool htmlAttrVal_nl;        /* allow nl in attribute values */

bool
parseTag(char *e,
  const char **name, int *namelen, const char **attr, const char **end,
  int *lines)
{
    int lns = 0;

    if(*e++ != '<')
        return false;
    if(name)
        *name = e;
    if(*e == '/')
        e++;
    if(!isA(*e))
        return false;
    while(isA(*e) || *e == '=')
        ++e;
    if(!isspaceByte(*e) && *e != '>' && *e != '<' && *e != '/' && *e != ':')
        return false;
    /* Note that name includes the leading / */
    if(name && namelen)
        *namelen = e - *name;

    /* skip past space colon slash */
    while(isspaceByte(*e) || *e == '/' || *e == ':') {
        if(*e == '\n')
            ++lns;
        ++e;
    }

    /* should be the start of the first attribute, or < or > */
    if(!atchr(*e) && *e != '>' && *e != '<')
        return false;
    if(attr)
        *attr = e;

  nextattr:
    if(*e == '>' || *e == '<')
        goto en;
    if(!atchr(*e))
        return false;    /* this includes running off the end of the text */
    /* the attribute name */
    while(atchr(*e))
        ++e;
    while(isspaceByte(*e)) {
        if(*e == '\n')
            ++lns;
        ++e;
    }
    if(*e != '=')
        goto nextattr;    /* attribute with no value */
    ++e;
    while(isspaceByte(*e)) {
        if(*e == '\n')
            ++lns;
        ++e;
    }

    if(isquote(*e)) {
        unsigned char uu = *e;
      x3:
        ++e;
        while(*e != uu && *e) {
            if(*e == '\n')
                ++lns;
            ++e;
        }
        if(*e != uu)
            return false;    /* unterminated quote */
        ++e;
        if(*e == uu) {
            /* lots of tags end with an extra quote */
            if(e[1] == '>')
                *e = ' ';
            else
                goto x3;    /* doubled quote, the value continues */
        }
    } else {
        /* unquoted value */
        while(!isspaceByte(*e) && *e != '>' && *e != '<' && *e)
            ++e;
    }

    while(isspaceByte(*e)) {
        if(*e == '\n')
            ++lns;
        ++e;
    }
    goto nextattr;

  en:
    /* could be < or > */
    if(end)
        *end = e + (*e == '>');
    if(lines)
        *lines = lns;
    return true;
}                /* parseTag */

218

219

/* Don't know why he didn't use the stringAndChar() functions, but he

220

* invented something new here, so on we go. */

221

static void

222

valChar(char **sp, int *lp, char c)

223

{

224

char *s = *sp;

225

int l = *lp;

226

if(!(l % ALLOC_GR))

227

*sp = s = reallocMem(s, l + ALLOC_GR);

228

s[l++] = c;

229

*lp = l;

230

} /* valChar */

231

232

/*********************************************************************

233

Find an attribute in an html tag.

234

e is attr pointer previously gotten from parseTag, DON'T PASS HERE ANY OTHER VALUE!!!

235

name is the sought attribute.

236

returns allocated string containing the attribute, or NULL on unsuccess.

237

*********************************************************************/

238

239

char *

240

htmlAttrVal(const char *e, const char *name)

241

{

242

const char *n;

243

char *a = EMPTYSTRING; /* holds the value */

244

char *b;

245

int l = 0; /* length */

246

char f;

247

if(!e)

248

return a;

249

top:

250

while(isspaceByte(*e))

251

e++;

252

if(!*e)

253

return 0;

254

if(*e == '>' || *e == '<')

255

return 0;

256

n = name;

257

while(*n && !((*e ^ *n) & 0xdf))

258

e++, n++;

259

f = *n;

260

while(atchr(*e))

261

f = 'x', e++;

262

while(isspaceByte(*e))

263

e++;

264

if(*e != '=')

265

goto ea;

266

e++;

267

while(isspaceByte(*e))

268

e++;

269

if(!isquote(*e)) {

270

while(*e && !isspaceByte(*e) && *e != '>' && *e != '<') {

271

if(!f)

272

valChar(&a, &l, *e);

273

e++;

274

}

275

} else {

276

char uu = *e;

277

278

e++;

279

while(*e != uu) {

280

if(!*e) {

281

nzFree(a);

282

return NULL;

283

}

284

if(!f && *e != '\r') {

285

if(*e != '\t' && *e != '\n')

286

valChar(&a, &l, *e);

287

else if(!htmlAttrVal_nl)

288

valChar(&a, &l, ' ');

289

}

290

e++;

291

}

292

e++;

293

if(*e == uu) {

294

if(!f)

295

valChar(&a, &l, uu);

296

goto a;

297

}

298

}

299

ea:

300

if(f)

301

goto top; /* no match, next attribute */

302

if(l)

303

valChar(&a, &l, 0); /* null terminate */

304

if(strchr(a, '&')) {

305

b = a;

306

a = andTranslate(b, true);

307

nzFree(b);

308

}

309

/* strip leading and trailing spaces.

310

* Are we really suppose to do this? */

311

for(b = a; *b == ' '; b++) ;

312

if(b > a)

313

strcpy(a, b);

314

for(b = a + strlen(a) - 1; b >= a && *b == ' '; b--)

315

*b = 0;

316

return a;

317

} /* htmlAttrVal */

318

319

320

/*********************************************************************
Jump straight to the </script>, and don't look at anything in between.
h is the text just after the opening tag, and tagname is the name of
the tag to be closed; the search for </tagname> is case insensitive.
If is_js is set, a closing tag that seems to lie inside a javascript
string is passed over, and the search resumes after it.
Result parameters:
*end_p is the end of the script, i.e. the start of the closing tag,
   or the end of h if there is no closing tag.
*new_p is the extracted script, made by pullString1().
*lines is the number of newlines in the script.
Each of these may be null if not wanted.
Returns false, after calling browseError(), if there is no closing tag.
*********************************************************************/

bool
findEndScript(const char *h, const char *tagname,
  bool is_js, char **end_p, char **new_p, int *lines)
{
    char *end;
    bool rc = true;
    const char *s = h;
    char look[64];
    int looklen;
    int js_nl = 0;

    /* Build the closing tag.  If it doesn't fit in look[]
     * we can't search for it, and report it as missing. */
    looklen = snprintf(look, sizeof(look), "</%s>", tagname);
    if(looklen < 0)
        look[0] = 0;

  retry:
    end = 0;
    if(looklen >= 0 && looklen < (int)sizeof(look))
        end = strstrCI(s, look);
    if(!end) {
        rc = false;
        browseError("no closing %s", look);
        end = (char *)h + strlen(h);
    } else if(is_js) {
        /* Check for document.write("</script>");
         * This isn't legal javascript, but it happens all the time!
         * This is a really stupid check.
         * Scan forward 30 chars, on the same line, looking
         * for a quote, and ) ; or + */
        char c;
        int j;
        s = end + strlen(look);
        for(j = 0; j < 30; ++j, ++s) {
            c = *s;
            if(!c)
                break;
            if(c == '\n')
                break;
            if(c != '"' && c != '\'')
                continue;
            while(s[1] == ' ')
                ++s;
            c = s[1];
            if(!c)
                break;
            if(strchr(";)+", c))
                goto retry;    /* resume the search from here */
        }
    }

    if(end_p)
        *end_p = end;
    if(new_p)
        *new_p = pullString1(h, end);

    /* count the newlines */
    while(h < end) {
        if(*h == '\n')
            ++js_nl;
        ++h;
    }
    if(lines)
        *lines = js_nl;
    return rc;
}                /* findEndScript */

384

385

386

/*********************************************************************

387

The primary goal of this routine is to turn

388

Hey,{ click here } for more information

389

into

390

Hey, {click here} for more information

391

But of course we won't do that if the section is preformatted.

392

Nor can we muck with the whitespace that might be present in an input field <>.

393

State variables remember:

394

Whether we are in a preformatted section

395

Whether we have seen any visible text in the document

396

Whether we have seen any visible text in the current hyperlink,

397

between the braces.

398

Whether we are stepping through a span of whitespace.

399

A tag and adjacent whitespace might be swapped, depending on state.

400

If a change is made, the procedure is run again,

401

kinda like bubble sort.

402

It has the potential to be terribly inefficient,

403

but that's not likely.

404

Use cnt to count the iterations, just for debugging.

405

*********************************************************************/

406

407

void

408

anchorSwap(char *buf)

409

{

410

char c, d, *s, *ss, *w, *a;

411

bool premode, pretag, state_braces, state_text, state_atext;

412

bool strong, change, slash;

413

int n, cnt;

414

char tag[20];

415

416

/* Transliterate a few characters. One of them is 0xa0 to space,

417

* so we need to do this now, before the anchors swap with whitespace.

418

* Also get rid of hyperlinks with absolutely nothing to click on. */

419

for(s = w = buf; c = *s; ++s) {

420

static const char from[] =

421

"\x1b\x95\x99\x9c\x9d\x91\x92\x93\x94\xa0\xad\x96\x97\x85\xa6\xc2";

422

static const char becomes[] = "_*'`'`'`' ----- ";

423

ss = strchr(from, c);

424

if(ss)

425

c = becomes[ss - from];

426

if(c != (char)InternalCodeChar)

427

goto put1;

428

if(!isdigitByte(s[1]))

429

goto put1;

430

for(a = s + 2; isdigitByte(*a); ++a) ;

431

if(*a != '{')

432

goto put1;

433

for(++a; *a == ' '; ++a) ;

434

if(memcmp(a, "\2000}", 3))

435

goto put1;

436

s = a + 2;

437

continue;

438

put1:

439

*w++ = c;

440

}

441

*w = 0;

442

443

cnt = 0;

444

change = true;

445

while(change) {

446

change = false;

447

++cnt;

448

premode = state_text = state_atext = state_braces = false;

449

/* w represents the state of whitespace */

450

w = 0;

451

/* a represents the state of being in an anchor */

452

a = 0;

453

454

for(s = buf; c = *s; ++s) {

455

if(isspaceByte(c)) {

456

if(!w)

457

w = s;

458

continue;

459

}

460

461

/* end of white space, should we swap it with prior tag? */

462

if(w && a && !premode &&

463

((state_braces & !state_atext) ||

464

((!state_braces) & !state_text))) {

465

memcpy(a, w, s - w);

466

memcpy(a + (s - w), tag, n);

467

change = true;

468

w = 0;

469

}

470

471

/* prior anchor has no significance */

472

a = 0;

473

474

if(c == (char)InternalCodeChar) {

475

if(!isdigitByte(s[1]))

476

goto normalChar;

477

n = strtol(s + 1, &ss, 10);

478

preFormatCheck(n, &pretag, &slash);

479

d = *ss;

480

/* the following should never happen */

481

if(!strchr("{}<>*", d))

482

goto normalChar;

483

n = ss + 1 - s;

484

memcpy(tag, s, n);

485

tag[n] = 0;

486

487

if(pretag) {

488

w = 0;

489

premode = !slash;

490

s = ss;

491

continue;

492

}

493

494

/* We have a tag, should we swap it with prior whitespace? */

495

if(w && !premode &&

496

(d == '}' ||

497

d == '@' &&

498

((state_braces & state_atext) ||

499

((!state_braces) & state_text)))) {

500

memmove(w + n, w, s - w);

501

memcpy(w, tag, n);

502

change = true;

503

w += n;

504

if(d == '}')

505

state_braces = false;

506

s = ss;

507

continue;

508

}

509

510

/* prior whitespace doesn't matter any more */

511

w = 0;

512

513

if(d == '{') {

514

state_braces = state_text = true;

515

state_atext = false;

516

a = s;

517

s = ss;

518

continue;

519

}

520

521

if(d == '}') {

522

state_braces = false;

523

s = ss;

524

continue;

525

}

526

527

if(d == '*') {

528

if(state_braces)

529

state_atext = true;

530

else

531

state_text = true;

532

a = s;

533

s = ss;

534

continue;

535

}

536

537

/* The remaining tags are <>, for an input field. */

538

s = ss;

539

c = d;

540

/* end of tag processing */

541

}

542

543

normalChar:

544

w = 0; /* no more whitespace */

545

if(state_braces)

546

state_atext = true;

547

else

548

state_text = true;

549

/* end of loop over the chars in the buffer */

550

}

551

/* end of loop making changes */

552

}

553

debugPrint(3, "anchorSwap %d", cnt);

554

555

/* Framing characters like [] around an anchor are unnecessary here,

556

* because we already frame it in braces.

557

* Get rid of these characters, even in premode.

558

* Also, remove trailing pipes on a line. */

559

ss = 0; /* remember location of first pipe */

560

for(s = w = buf; c = *s; ++s) {

561

char open, close, linkchar;

562

if(!strchr("{[(<", c))

563

goto putc;

564

if(s[1] != (char)InternalCodeChar)

565

goto putc;

566

if(!isdigitByte(s[2]))

567

goto putc;

568

for(a = s + 3; isdigitByte(*a); ++a) ;

569

linkchar = 0;

570

if(*a == '{')

571

linkchar = '}';

572

if(*a == '<')

573

linkchar = '>';

574

if(!linkchar)

575

goto putc;

576

open = c;

577

close = 0;

578

if(open == '{')

579

close = '}';

580

if(open == '[')

581

close = ']';

582

if(open == '(')

583

close = ')';

584

if(open == '<')

585

close = '>';

586

n = 1;

587

while(n < 120) {

588

d = a[n++];

589

if(!d)

590

break;

591

if(d != (char)InternalCodeChar)

592

continue;

593

while(isdigitByte(a[n]))

594

++n;

595

d = a[n++];

596

if(!d)

597

break; /* should never happen */

598

if(strchr("{}<>", d))

599

break;

600

}

601

if(n >= 120)

602

goto putc;

603

if(d != linkchar)

604

goto putc;

605

a += n;

606

if(*a != close)

607

goto putc;

608

++s;

609

memcpy(w, s, a - s);

610

w += a - s;

611

s = a;

612

ss = 0;

613

continue;

614

putc:

615

if(c == '|' && !ss)

616

ss = w;

617

if(strchr("\r\n\f", c) && ss)

618

w = ss, ss = 0;

619

if(!isspaceByte(c) && c != '|')

620

ss = 0;

621

*w++ = c;

622

} /* loop over buffer */

623

*w = 0;

624

debugPrint(3, "anchors unframed");

625

626

/* Now compress the implied linebreaks into one. */

627

premode = false;

628

for(s = buf; c = *s; ++s) {

629

if(c == (char)InternalCodeChar && isdigitByte(s[1])) {

630

n = strtol(s + 1, &s, 10);

631

if(*s == '*') {

632

preFormatCheck(n, &pretag, &slash);

633

if(pretag)

634

premode = !slash;

635

}

636

}

637

if(!isspaceByte(c))

638

continue;

639

strong = false;

640

a = 0;

641

for(w = s; isspaceByte(*w); ++w) {

642

if(*w == '\n' || *w == '\f')

643

strong = true;

644

if(*w == '\r' && !a)

645

a = w;

646

}

647

ss = s, s = w - 1;

648

if(!a)

649

continue;

650

if(premode)

651

continue;

652

if(strong) {

653

for(w = ss; w <= s; ++w)

654

if(*w == '\r')

655

*w = ' ';

656

continue;

657

}

658

for(w = ss; w <= s; ++w)

659

if(*w == '\r' && w != a)

660

*w = ' ';

661

} /* loop over buffer */

662

debugPrint(3, "whitespace combined");

663

} /* anchorSwap */

664

665

666

/*********************************************************************
Format text, and break lines at sentence/phrase boundaries.
The prefix bl means breakline.
The variables below are shared by the routines that follow.
*********************************************************************/

/* The output buffer: its start, the next free byte, and its limit. */
static char *bl_start, *bl_cursor, *bl_end;
/* set when a character could not be stored because the buffer was full */
static bool bl_overflow;
static int colno;        /* column number */
static const int optimalLine = 80;    /* optimal line length */
static const int cutLineAfter = 36;    /* cut sentence after this column */
static const int paraLine = 120;    /* paragraph in a line */
/* longcut is the buffer index of the most recent cut that was not made
 * at a period, or 0.  pre_cr is a count of newlines to be added to
 * the next chunk of whitespace. */
static int longcut, pre_cr;
/* last space value: 0 after printable text, 1 after a space,
 * 2 after a newline, 3 = paragraph,
 * 4 is a continuation code passed between calls to breakLine() */
static int lspace;
/* Location of period comma rightparen or any word.
 * Question mark is equivalent to period etc.
 * Other things being equal, we break at period, rather than comma, etc.
 * First the column numbers, then the index into the string. */
static int lperiod, lcomma, lright, lany;
static int idxperiod, idxcomma, idxright, idxany;

685

686

static void

687

debugChunk(const char *chunk, int len)

688

{

689

int i;

690

if(debugLevel < 7)

691

return;

692

printf("chunk<");

693

for(i = 0; i < len; ++i) {

694

char c = chunk[i];

695

if(c == '\t') {

696

printf("\\t");

697

continue;

698

}

699

if(c == '\n') {

700

printf("\\n");

701

continue;

702

}

703

if(c == '\f') {

704

printf("\\f");

705

continue;

706

}

707

if(c == '\r') {

708

printf("\\r");

709

continue;

710

}

711

if(c == '\0') {

712

printf("\\0");

713

continue;

714

}

715

printf("%c", c);

716

}

717

printf(">%d.%d\n", colno, lspace);

718

} /* debugChunk */

719

720

static void

721

appendOneChar(char c)

722

{

723

if(bl_cursor == bl_end)

724

bl_overflow = true;

725

else

726

*bl_cursor++ = c;

727

} /* appendOneChar */

728

729

static bool

730

spaceNotInInput(void)

731

{

732

char *t = bl_cursor;

733

char c;

734

for(--t; t >= bl_start; --t) {

735

c = *t;

736

if(c == '\n' || c == '\r')

737

return true;

738

if(c == '>' && t >= bl_start + 2 &&

739

t[-1] == '0' && t[-2] == (char)InternalCodeChar)

740

return true;

741

if(c != '<')

742

continue;

743

while(t > bl_start && isdigitByte(t[-1]))

744

--t;

745

if(*t == '<')

746

continue;

747

if(t > bl_start && t[-1] == (char)InternalCodeChar)

748

return false;

749

}

750

return true;

751

} /* spaceNotInInput */

752

753

static void

754

appendSpaceChunk(const char *chunk, int len, bool premode)

755

{

756

int nlc = pre_cr; /* newline count */

757

int spc = 0; /* space count */

758

int i, j;

759

char c, d, e;

760

761

if(!len)

762

return;

763

for(i = 0; i < len; ++i) {

764

c = chunk[i];

765

if(c == '\n' || c == '\r') {

766

++nlc, spc = 0;

767

continue;

768

}

769

if(c == '\f') {

770

nlc += 2, spc = 0;

771

continue;

772

}

773

++spc;

774

}

775

776

if(!premode && spaceNotInInput()) {

777

int l = bl_cursor - bl_start;

778

c = d = ' ';

779

if(l)

780

d = bl_cursor[-1];

781

if(l > 1)

782

c = bl_cursor[-2];

783

e = d;

784

if(strchr(")\"|}", d))

785

e = c;

786

if(strchr(".?!:", e)) {

787

bool ok = true;

788

/* Check for Mr. Mrs. and others. */

789

if(e == '.' && bl_cursor - bl_start > 10) {

790

static const char *const prefix[] =

791

{ "mr.", "mrs.", "sis.", "ms.", 0 };

792

char trailing[12];

793

for(i = 0; i < 6; ++i) {

794

c = bl_cursor[i - 6];

795

if(isupperByte(c))

796

c = tolower(c);

797

trailing[i] = c;

798

}

799

trailing[i] = 0;

800

for(i = 0; prefix[i]; ++i)

801

if(strstr(trailing, prefix[i]))

802

ok = false;

803

/* Check for John C. Calhoon */

804

if(isupperByte(bl_cursor[-2]) && isspaceByte(bl_cursor[-3]))

805

ok = false;

806

}

807

if(ok)

808

lperiod = colno, idxperiod = l;

809

}

810

e = d;

811

if(strchr(")\"|", d))

812

e = c;

813

if(strchr("-,;", e))

814

lcomma = colno, idxcomma = l;

815

if(strchr(")\"|", d))

816

lright = colno, idxright = l;

817

lany = colno, idxany = l;

818

/* tack a short fragment onto the previous line. */

819

if(longcut && colno <= 15 && (nlc || lperiod == colno)) {

820

bl_start[longcut] = ' ';

821

if(!nlc)

822

len = spc = 0, nlc = 1;

823

} /* pasting small fragment onto previous line */

824

} /* allowing line breaks */

825

if(lspace == 3)

826

nlc = 0;

827

if(nlc) {

828

if(lspace == 2)

829

nlc = 1;

830

appendOneChar('\n');

831

if(nlc > 1)

832

appendOneChar('\n');

833

colno = 1;

834

longcut = lperiod = lcomma = lright = lany = 0;

835

if(lspace >= 2 || nlc > 1)

836

lspace = 3;

837

if(lspace < 2)

838

lspace = 2;

839

if(!premode)

840

return;

841

}

842

if(!spc)

843

return;

844

if(!premode) {

845

/* if the first char of the text to be reformatted is space,

846

* then we will wind up here, with lspace = 3. */

847

if(lspace == 3)

848

return;

849

appendOneChar(' ');

850

++colno;

851

lspace = 1;

852

return;

853

}

854

j = -1;

855

for(i = 0; i < len; ++i) {

856

c = chunk[i];

857

if(c == '\n' || c == '\r' || c == '\f')

858

j = i;

859

}

860

i = j + 1;

861

if(i)

862

colno = 1;

863

for(; i < len; ++i) {

864

c = chunk[i];

865

if(c == 0)

866

c = ' ';

867

appendOneChar(c);

868

if(c == ' ')

869

++colno;

870

if(c == '\t')

871

colno += 4;

872

}

873

lspace = 1;

874

} /* appendSpaceChunk */

875

876

static void

877

appendPrintableChunk(const char *chunk, int len, bool premode)

878

{

879

int i, j;

880

for(i = 0; i < len; ++i)

881

appendOneChar(chunk[i]);

882

colno += len;

883

lspace = 0;

884

if(premode)

885

return;

886

if(colno <= optimalLine)

887

return;

888

/* Oops, line is getting long. Let's see where we can cut it. */

889

i = j = 0;

890

if(lperiod > cutLineAfter)

891

i = lperiod, j = idxperiod;

892

else if(lcomma > cutLineAfter)

893

i = lcomma, j = idxcomma;

894

else if(lright > cutLineAfter)

895

i = lright, j = idxright;

896

else if(lany > cutLineAfter)

897

i = lany, j = idxany;

898

if(!j)

899

return; /* nothing we can do about it */

900

longcut = 0;

901

if(i != lperiod)

902

longcut = j;

903

bl_start[j] = '\n';

904

colno -= i;

905

lperiod -= i;

906

lcomma -= i;

907

lright -= i;

908

lany -= i;

909

} /* appendPrintableChunk */

910

911

/* Break up a line using the above routines.

912

* The buffer for the new text must be supplied.

913

* Return false (fail) if we ran out of room.

914

* This function is called from bufsup.c, implementing the bl command,

915

* and is only in this file because it shares the above routines and variables

916

* with the html reformatting, which really has to be here. */

917

bool

918

breakLine(const char *line, int len, int *newlen)

919

{

920

char c, state, newstate;

921

int i, last;

922

923

pre_cr = 0;

924

if(len && line[len - 1] == '\r')

925

--len;

926

if(lspace == 4) {

927

/* special continuation code from the previous invokation */

928

lspace = 2;

929

if(line[0])

930

++pre_cr;

931

}

932

if(len > paraLine)

933

++pre_cr;

934

if(lspace < 2)

935

lspace = 2; /* should never happen */

936

if(!len + pre_cr)

937

lspace == 3;

938

bl_start = bl_cursor = replaceLine;

939

bl_end = replaceLine + REPLACELINELEN - 8;

940

bl_overflow = false;

941

colno = 1;

942

longcut = lperiod = lcomma = lright = lany = 0;

943

last = 0;

944

state = 0;

945

if(pre_cr)

946

state = 1;

947

948

for(i = 0; i < len; ++i) {

949

c = line[i];

950

newstate = 2;

951

if(!c || strchr(" \t\n\r\f", c))

952

newstate = 1;

953

if(state == newstate)

954

continue;

955

if(!state) {

956

state = newstate;

957

continue;

958

}

959

960

/* state change here */

961

debugChunk(line + last, i - last);

962

if(state == 1)

963

appendSpaceChunk(line + last, i - last, false);

964

else

965

appendPrintableChunk(line + last, i - last, false);

966

last = i;

967

state = newstate;

968

pre_cr = 0;

969

}

970

971

if(state) { /* last token */

972

debugChunk(line + last, len - last);

973

if(state == 1)

974

appendSpaceChunk(line + last, len - last, false);

975

else

976

appendPrintableChunk(line + last, len - last, false);

977

}

978

979

if(lspace < 2) { /* line didn't have a \r at the end */

980

appendSpaceChunk("\n", 1, false);

981

}

982

if(bl_cursor - bl_start > paraLine)

983

lspace = 4;

984

debugPrint(7, "chunk<EOL>%d.%d", colno, lspace);

985

*newlen = bl_cursor - bl_start;

986

return !bl_overflow;

987

} /* breakLine */

988

989

void

990

breakLineSetup(void)

991

{

992

lspace = 3;

993

}

994

995

char *

996

htmlReformat(const char *buf)

997

{

998

const char *h, *nh, *s;

999

char c;

1000

bool premode = false;

1001

bool pretag, slash;

1002

char *new;

1003

int l, tagno;

1004

1005

longcut = lperiod = lcomma = lright = lany = 0;

1006

colno = 1;

1007

pre_cr = 0;

1008

lspace = 3;

1009

bl_start = bl_cursor = replaceLine;

1010

bl_end = replaceLine + REPLACELINELEN - 8;

1011

bl_overflow = false;

1012

new = initString(&l);

1013

1014

for(h = buf; (c = *h); h = nh) {

1015

if(isspaceByte(c)) {

1016

for(s = h + 1; isspaceByte(*s); ++s) ;

1017

nh = s;

1018

appendSpaceChunk(h, nh - h, premode);

1019

if(lspace == 3 || lspace == 2 &&

1020

(bl_cursor - bl_start) >= (bl_end - bl_start) * 2 / 3) {

1021

if(bl_cursor > bl_start)

1022

stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);

1023

bl_cursor = bl_start;

1024

lspace = 3;

1025

longcut = lperiod = lcomma = lright = lany = 0;

1026

colno = 1;

1027

}

1028

continue;

1029

}

1030

/* white space */

1031

if(c != (char)InternalCodeChar) {

1032

for(s = h + 1; *s; ++s)

1033

if(isspaceByte(*s) || *s == (char)InternalCodeChar)

1034

break;

1035

nh = s;

1036

appendPrintableChunk(h, nh - h, premode);

1037

continue;

1038

}

1039

1040

/* word */

1041

/* It's a tag */

1042

tagno = strtol(h + 1, (char **)&nh, 10);

1043

c = *nh++;

1044

if(!c || !strchr("{}<>*", c))

1045

errorPrint("@tag code %d has bad character %c following", tagno, c);

1046

appendPrintableChunk(h, nh - h, premode);

1047

preFormatCheck(tagno, &pretag, &slash);

1048

if(pretag)

1049

premode = !slash;

1050

1051

/* Insert newlines between adjacent hyperlinks. */

1052

if(c != '}' || premode)

1053

continue;

1054

for(h = nh; c = *h; ++h)

1055

if(!strchr(" \t,:-|;", c))

1056

break;

1057

if(!c || strchr("\r\n\f", c)) {

1058

nh = h;

1059

continue;

1060

}

1061

if(c != (char)InternalCodeChar)

1062

continue;

1063

/* Does this start a new hyperlink? */

1064

for(s = h + 1; isdigitByte(*s); ++s) ;

1065

if(*s != '{')

1066

continue;

1067

appendSpaceChunk("\n", 1, false);

1068

nh = h;

1069

} /* loop over text */

1070

1071

/* close off the last line */

1072

if(lspace < 2)

1073

appendSpaceChunk("\n", 1, true);

1074

if(bl_cursor > bl_start)

1075

stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);

1076

/* Get rid of last space. */

1077

if(l >= 2 && new[l - 1] == '\n' && new[l - 2] == ' ')

1078

new[l - 2] = '\n', new[--l] = 0;

1079

/* Don't need empty lines at the end. */

1080

while(l > 1 && new[l - 1] == '\n' && new[l - 2] == '\n')

1081

--l;

1082

new[l] = 0;

1083

/* Don't allow an empty buffer */

1084

if(!l)

1085

stringAndChar(&new, &l, '\n');

1086

1087

return new;

1088

} /* htmlReformat */

1089

1090

1091

/*********************************************************************

1092

And-convert the string; you know,   < etc.

1093

This is the routine that makes it possible for me to read, and write,

1094

my math site. http://www.mathreference.com/accessible.html

1095

In the invisible mode, graphics characters are not rendered at all.

1096

This is used when translating attributes inside tags,

1097

such as HREF, in an anchor.

1098

The original string is not disturbed.

1099

The new string is allocated.

1100

*********************************************************************/

1101

1102

char *

1103

andTranslate(const char *s, bool invisible)

1104

{

1105

char *new;

1106

int l, n, j;

1107

uchar c, d;

1108

uchar alnum = 0; /* was last char an alphanumeric */

1109

bool premode;

1110

char andbuf[16];

1111

1112

static const char *const andwords[] = {

1113

"gt\0>",

1114

"lt\0<",

1115

"quot\0\"",

1116

"raquo\0-",

1117

"ldquo\0\"",

1118

"rdquo\0\"",

1119

"lsquo\0'",

1120

"rsquo\0'",

1121

"plus\0+",

1122

"minus\0-",

1123

"mdash\0 - ",

1124

"ndash\0 - ",

1125

"colon\0:",

1126

"apos\0`",

1127

"star\0*",

1128

"comma\0,",

1129

"period\0.",

1130

"dot\0.",

1131

"dollar\0$",

1132

"percnt\0%",

1133

"amp\0&",

1134

"iexcl\0!",

1135

"ntilde\0\xf1",

1136

"Ntilde\0\xd1",

1137

"agrave\0\xe0",

1138

"Agrave\0\xc0",

1139

"egrave\0\xe8",

1140

"Egrave\0\xc8",

1141

"igrave\0\xec",

1142

"Igrave\0\xcc",

1143

"ograve\0\xf2",

1144

"Ograve\0\xd2",

1145

"ugrave\0\xf9",

1146

"Ugrave\0\xd9",

1147

"auml\0\xe4",

1148

"Auml\0\xc4",

1149

"euml\0\xeb",

1150

"Euml\0\xcb",

1151

"iuml\0\xef",

1152

"Iuml\0\xcf",

1153

"ouml\0\xf6",

1154

"Ouml\0\xd6",

1155

"uuml\0\xfc",

1156

"Uuml\0\xdc",

1157

"yuml\0\xff",

1158

"Yuml\0Y",

1159

"aacute\0\xe1",

1160

"Aacute\0\xc1",

1161

"eacute\0\xe9",

1162

"Eacute\0\xc9",

1163

"iacute\0\xed",

1164

"Iacute\0\xcd",

1165

"oacute\0\xf3",

1166

"Oacute\0\xd3",

1167

"uacute\0\xfa",

1168

"Uacute\0\xda",

1169

"yacute\0\xfd",

1170

"Yacute\0\xdd",

1171

"atilde\0\xe3",

1172

"Atilde\0\xc3",

1173

"itilde\0i",

1174

"Itilde\0I",

1175

"otilde\0\xf5",

1176

"Otilde\0\xd5",

1177

"utilde\0u",

1178

"Utilde\0U",

1179

"acirc\0\xe2",

1180

"Acirc\0\xc2",

1181

"ecirc\0\xea",

1182

"Ecirc\0\xca",

1183

"icirc\0\xee",

1184

"Icirc\0\xce",

1185

"ocirc\0\xf4",

1186

"Ocirc\0\xd4",

1187

"ucirc\0\xfb",

1188

"Ucirc\0\xdb",

1189

"pound\0\xa3",

1190

"cent\0\xa2",

1191

"sdot\0\xb7",

1192

"middot\0\xb7",

1193

"edot\0e",

1194

"nbsp\0 ",

1195

"times\0\xd7",

1196

"divide\0\xf7",

1197

"deg\0\xb0",

1198

"copy\0\xa9",

1199

"reg\0\xae",

1200

"frac14\0\xbc",

1201

"half\0\xbd",

1202

"frac34\0\xbe",

1203

"frac13\01/3",

1204

"frac23\02/3",

1205

"plusmn\0+-",

1206

"laquo\0left arrow",

1207

"#171\0left arrow",

1208

"raquo\0arrow",

1209

"#187\0arrow",

1210

"micro\0micro",

1211

"trade\0(TM)",

1212

"hellip\0...",

1213

"#275\0`",

1214

"#913\0Alpha",

1215

"#914\0Beta",

1216

"#915\0Gamma",

1217

"#916\0Delta",

1218

"#917\0Epsilon",

1219

"#918\0Zeta",

1220

"#919\0Eta",

1221

"#920\0Theta",

1222

"#921\0Iota",

1223

"#922\0Kappa",

1224

"#923\0Lambda",

1225

"#924\0Mu",

1226

"#925\0Nu",

1227

"#926\0Xi",

1228

"#927\0Omicron",

1229

"#928\0Pi",

1230

"#929\0Rho",

1231

"#931\0Sigma",

1232

"#932\0Tau",

1233

"#933\0Upsilon",

1234

"#934\0Phi",

1235

"#935\0Chi",

1236

"#936\0Psi",

1237

"#937\0Omega",

1238

"#945\0alpha",

1239

"#946\0beta",

1240

"#947\0gamma",

1241

"#948\0delta",

1242

"#949\0epsilon",

1243

"#950\0zeta",

1244

"#951\0eta",

1245

"#952\0theta",

1246

"#953\0iota",

1247

"#954\0kappa",

1248

"#955\0lambda",

1249

"#956\0mu",

1250

"#957\0nu",

1251

"#958\0xi",

1252

"#959\0omicron",

1253

"#960\0pi",

1254

"#961\0rho",

1255

"#962\0sigmaf",

1256

"#963\0sigma",

1257

"#964\0tau",

1258

"#965\0upsilon",

1259

"#966\0phi",

1260

"#967\0chi",

1261

"#968\0psi",

1262

"#969\0omega",

1263

"#177\0+-",

1264

"#8211\0-",

1265

"#8212\0 - ",

1266

"#8216\0`",

1267

"#8217\0'",

1268

"#8220\0`",

1269

"#8221\0'",

1270

"#8226\0*",

1271

"#8230\0...",

1272

"#8242\0prime",

1273

"#8501\0aleph",

1274

"#8592\0left arrow",

1275

"#8593\0up arrow",

1276

"#8594\0arrow",

1277

"#8595\0down arrow",

1278

"#8660\0double arrow",

1279

"#8704\0every",

1280

"#8706\0d",

1281

"#8707\0some",

1282

"#8709\0empty set",

1283

"#8711\0del",

1284

"#8712\0member of",

1285

"#8713\0not a member of",

1286

"#8717\0such that",

1287

"#8721\0sum",

1288

"#8734\0infinity",

1289

"#8736\0angle",

1290

"#8745\0intersect",

1291

"#8746\0union",

1292

"#8747\0integral",

1293

"#8773\0congruent to",

1294

"#8800\0not equal",

1295

"#8804\0<=",

1296

"#8805\0>=",

1297

"#8834\0proper subset of",

1298

"#8835\0proper superset of",

1299

"#8836\0not a subset of",

1300

"#8838\0subset of",

1301

"#8839\0superset of",

1302

"#9658\0*",

1303

1304

};

1305

1306

if(!s)

1307

return 0;

1308

if(s == EMPTYSTRING)

1309

return EMPTYSTRING;

1310

new = initString(&l);

1311

1312

while(c = *s) {

1313

if(c == (uchar) InternalCodeChar && !invisible) {

1314

const char *t = s + 1;

1315

while(isdigitByte(*t))

1316

++t;

1317

if(t > s + 1 && *t && strchr("{}<>*", *t)) { /* it's a tag */

1318

bool separate, pretag, slash;

1319

n = atoi(s + 1);

1320

preFormatCheck(n, &pretag, &slash);

1321

separate = (*t != '*');

1322

if(separate)

1323

alnum = 0;

1324

debugPrint(7, "tag %d%c separate %d", n, *t, separate);

1325

if(pretag)

1326

premode = !slash;

1327

++t;

1328

stringAndBytes(&new, &l, s, t - s);

1329

s = t;

1330

continue;

1331

} /* tag */

1332

}

1333

/* code */

1334

j = 1;

1335

if(c != '&')

1336

goto putc;

1337

1338

for(j = 0; j < sizeof (andbuf); ++j) {

1339

d = s[j + 1];

1340

if(d == '&' || d == ';' || d <= ' ')

1341

break;

1342

}

1343

if(j == sizeof (andbuf))

1344

goto putc; /* too long, no match */

1345

strncpy(andbuf, s + 1, j);

1346

andbuf[j] = 0;

1347

++j;

1348

if(s[j] == ';')

1349

++j;

1350

/* remove leading zeros */

1351

if(andbuf[0] == '#')

1352

while(andbuf[1] == '0')

1353

strcpy(andbuf + 1, andbuf + 2);

1354

1355

lookup:

1356

debugPrint(6, "meta %s", andbuf);

1357

n = stringInList(andwords, andbuf);

1358

if(n >= 0) { /* match */

1359

const char *r = andwords[n] + strlen(andwords[n]) + 1; /* replacement string */

1360

s += j;

1361

if(!r[1]) { /* replace with a single character */

1362

c = *r;

1363

--s;

1364

goto putc;

1365

}

1366

if(invisible) {

1367

s -= j;

1368

goto putc;

1369

}

1370

/* We're replacing with a word */

1371

if(!invisible && isalnumByte(*r)) {

1372

/* insert spaces either side */

1373

if(alnum)

1374

stringAndChar(&new, &l, ' ');

1375

alnum = 2;

1376

} else

1377

alnum = 0;

1378

stringAndString(&new, &l, r);

1379

continue;

1380

}

1381

/* match */

1382

if(andbuf[0] != '#')

1383

goto putc;

1384

n = stringIsNum(andbuf + 1);

1385

if(n < 0)

1386

goto putc;

1387

if(n > 255)

1388

goto putc;

1389

c = n;

1390

/* don't allow nulls */

1391

if(c == 0)

1392

c = ' ';

1393

if(strchr("\r\n\f", c) && !premode)

1394

c = ' ';

1395

if(c == (uchar) InternalCodeChar)

1396

c = ' ';

1397

s += j - 1;

1398

j = 1;

1399

1400

putc:

1401

if(isalnumByte(c)) {

1402

if(alnum == 2)

1403

stringAndChar(&new, &l, ' ');

1404

alnum = 1;

1405

} else

1406

alnum = 0;

1407

stringAndChar(&new, &l, c);

1408

++s;

1409

} /* loop over input string */

1410

1411

return new;

1412

} /* andTranslate */

Older »