~ubuntu-branches/ubuntu/wily/edbrowse/wily

Viewing changes to src/format.c

Committer: Bazaar Package Importer
Author(s): Kapil Hari Paranjape
Date: 2008-04-09 18:55:23 UTC
mfrom: (1.1.4 upstream) (3.1.1 lenny)
Revision ID: james.westby@ubuntu.com-20080409185523-dqokcloumyn1ibn4

Tags: 3.3.4-1

* New upstream version (3.3.4).
- Convert between iso8859-1 and utf-8 on the fly.
- Support reading of pdf using pdftohtml.
- French translation of html documentation.
- Old html documentation renamed to usersguide.
- Additional documentation on philosophy.
* debian/control:
- Changed homepage to sourceforge site.
- Moved homepage from description to its own field.
- Added "poppler-utils | xpdf-utils" to Recommends.
- Added "www-browser", "mail-reader" and "editor" to Provides.
- Removed "XS-" from Vcs-Svn tag.
- Standards-Version: 3.7.3
* debian/docs: Added new documentation files
  from "doc/" subdirectory.
* debian/watch: Updated to use sourceforge site.
* debian/edbrowse.doc-base:
  - Changed name of upstream provided html documentation from
    "ebdoc.html" to "usersguide.html".
  - Changed section from "net" to "Network/Web Browsing".
* debian/install: Compiled binary is now in "src/".

files added:
doc

doc/man-edbrowse-debian.1

doc/philosophy.html

doc/philosophy_fr.html

doc/sample.ebrc

doc/sample_fr.ebrc

doc/setup.ebrc

doc/ssl-certs

doc/usersguide.html

doc/usersguide_fr.html

src/auth.c

src/buffers.c

src/cookies.c

src/dbapi.h

src/dbinfx.c

src/dbinfx.ec

src/dbodbc.c

src/dbops.c

src/dbstubs.c

src/eb.h

src/eb.p

src/fetchmail.c

src/format.c

src/html.c

src/http.c

src/jsdom.c

src/jsloc.c

src/jsrt

src/main.c

src/makefile

src/makefile.bsd

src/messages.c

src/messages.h

src/sendmail.c

src/stringfile.c

src/tcp.c

src/tcp.h

src/url.c

files removed:
auth.c

buffers.c

cookies.c

dbapi.h

dbinfx.c

dbinfx.ec

dbodbc.c

dbops.c

dbstubs.c

eb.h

eb.p

edbdoc.html

edbdoc_fr.html

fetchmail.c

format.c

html.c

http.c

jsdom.c

jsloc.c

jsrt

main.c

makefile.bsd

man-edbrowse-debian.1

messages.c

messages.h

sample.ebrc

sample_fr.ebrc

sendmail.c

ssl-certs

stringfile.c

tcp.c

tcp.h

url.c

files modified:
CHANGES

COPYING

README

debian/changelog

debian/control

debian/docs

debian/edbrowse.doc-base

debian/examples

debian/install

debian/rules

debian/watch

makefile

todo

Show diffs side-by-side

added added

removed removed

src/format.c

/* format.c
 * Format text, establish line breaks, manage whitespace.
 * Copyright (c) Karl Dahlke, 2008
 * This file is part of the edbrowse project, released under GPL.
 */

#include "eb.h"

/*********************************************************************

Prepare html for text processing.

Change nulls to spaces.

Make sure it doesn't already contain my magic code,

The one I use to indicate a tag.

If it does, well, change them to something else.

I can only hope this doesn't screw up some embedded javascript.

*********************************************************************/

/*
 * Clean up a raw html buffer in place and turn it into a C string.
 *
 *   h      buffer holding the data; a terminating null is stored at
 *          h[j] with j <= h_len, so the caller must provide room for
 *          one byte beyond h_len.
 *   h_len  number of data bytes in h.
 *
 * First pass, byte by byte:
 *   - a null byte becomes a space;
 *   - a backspace is dropped, and it also erases the previously kept
 *     byte unless the byte just before it is one of \n \b < > ' " &;
 *   - the byte pair 0xe2 0x80 is dropped (the byte after the pair is
 *     then handled like any other byte);
 *   - InternalCodeChar is replaced by InternalCodeCharAlternate.
 * Second pass: every \r that is directly followed by \n is removed.
 */
void
prepareForBrowse(char *h, int h_len)
{
    int i, j;

    for (i = j = 0; i < h_len; ++i) {
        if (h[i] == 0)
            h[i] = ' ';

        if (h[i] == '\b') {
            /* The j > 0 test matters: the bytes before the backspace may
             * all have been dropped (e.g. a leading 0xe2 0x80 pair), and
             * without it j would go negative and the next store would
             * land in front of the buffer. */
            if (i && j > 0 && !strchr("\n\b<>'\"&", h[i - 1]))
                --j;
            continue;
        }

        if (h[i] == (char)0xe2 && i < h_len - 1 && h[i + 1] == (char)0x80) {
            ++i;                /* skip both bytes of the pair */
            continue;
        }

        if (h[i] == InternalCodeChar)
            h[i] = InternalCodeCharAlternate;
        h[j++] = h[i];
    }
    h[j] = 0;                   /* now it's a string */

    /* Turn dos line endings into unix line endings. */
    for (i = j = 0; h[i]; ++i) {
        if (h[i] == '\r' && h[i + 1] == '\n')
            continue;
        h[j++] = h[i];
    }
    h[j] = 0;
}                               /* prepareForBrowse */

/*********************************************************************

Skip past an html comment.

Parse an html tag <tag foo=bar>

*********************************************************************/

/*
 * Step over a comment-like construct and return a pointer just past it.
 *
 *   h      start of the construct; h[1..3] are inspected to classify it,
 *          so h is expected to point at the opening '<'.
 *   lines  if not null, receives the number of newlines stepped over.
 *
 * Three forms are recognized:
 *   - "<?php" (case insensitive): ends after the next "?>".
 *     Newlines inside this form are not counted.
 *   - h[2] and h[3] both '-': ends at a run of two or more dashes,
 *     optional whitespace, and then '>'.
 *   - anything else: ends after the next '>'.
 * If the end marker is never found, the returned pointer addresses the
 * terminating null.
 */
const char *
skipHtmlComment(const char *h, int *lines)
{
    int newlines = 0;
    bool is_comment = (h[2] == '-' && h[3] == '-');
    bool is_php = memEqualCI(h + 1, "?php", 4);
    const char *p = h + (is_comment ? 4 : 2);

    while (*p) {
        if (is_php) {           /* special type of comment */
            if (p[0] == '?' && p[1] == '>') {
                p += 2;
                break;
            }
            ++p;
            continue;
        }

        if (!is_comment) {
            if (*p == '>') {
                ++p;
                break;
            }
        } else if (p[0] == '-' && p[1] == '-') {
            /* possible end of comment: dashes, whitespace, '>' */
            p += 2;
            while (*p == '-')
                ++p;
            for (; isspaceByte(*p); ++p)
                if (*p == '\n')
                    ++newlines;
            if (!*p)
                break;
            if (*p == '>') {
                ++p;
                break;
            }
            /* false alarm; p already addresses the next byte to examine */
            continue;
        }

        if (*p == '\n')
            ++newlines;
        ++p;
    }

    if (lines)
        *lines = newlines;
    return p;
}                               /* skipHtmlComment */

108

109

/* an attribute character */

110

/*
 * Is c a character that can be part of an attribute name?
 * True for anything that compares greater than a space, other than
 * '=', '<' and '>'.
 */
static bool
atchr(char c)
{
    if (c <= ' ')
        return false;
    return !(c == '=' || c == '<' || c == '>');
}                               /* atchr */

115

116

/*********************************************************************
Parse an html tag.
e is pointer to the beginning of the element (*e must be '<').
eof is pointer to the end of the html page.
Result parameters:
parsed tag name is stored in name, its length is namelen.
first attribute is stored in attr.
end points to first character past the html tag.
lines records the number of newlines consumed by the tag.
*********************************************************************/

126

127

/* Controls how htmlAttrVal() treats a tab or newline found inside a
 * quoted attribute value: when this flag is false the character is
 * stored as a space, when it is true the character is left out of the
 * returned value. */
bool htmlAttrVal_nl; /* allow nl in attribute values */

128

129

/* Step over whitespace starting at p, adding each newline crossed to *nl.
 * Returns the first non-whitespace position. */
static char *
parseTagSkipSpace(char *p, int *nl)
{
    while (isspaceByte(*p)) {
        if (*p == '\n')
            ++*nl;
        ++p;
    }
    return p;
}                               /* parseTagSkipSpace */

/*
 * Parse one html tag.
 *
 *   e        start of the tag; *e must be '<' or the parse fails.
 *   name     if not null, receives a pointer to the tag name, which
 *            includes a leading '/' when present.
 *   namelen  if not null (and name is not null), receives the length
 *            of that name.
 *   attr     if not null, receives a pointer to where the first
 *            attribute would start (this may be the closing '>' or a
 *            following '<').
 *   end      if not null, receives a pointer to the first character
 *            past the tag: just after '>', or at a '<' that cut the
 *            tag short.
 *   lines    if not null, receives the number of newlines inside the
 *            tag.
 *
 * Returns true if the text looks like a tag, false otherwise.  On
 * failure some of the result parameters may already have been written.
 *
 * The text can be modified: when a quoted value is followed by one extra
 * quote and then '>', that extra quote is overwritten with a space.
 */
bool
parseTag(char *e,
         const char **name, int *namelen, const char **attr,
         const char **end, int *lines)
{
    int lns = 0;

    if (*e++ != '<')
        return false;
    if (name)
        *name = e;
    if (*e == '/')
        e++;
    if (!isA(*e))
        return false;
    while (isA(*e) || *e == '=')
        ++e;
    if (!isspaceByte(*e) && *e != '>' && *e != '<' && *e != '/'
        && *e != ':')
        return false;

    /* Note that name includes the leading / */
    if (name && namelen)
        *namelen = e - *name;

    /* skip past space colon slash */
    while (isspaceByte(*e) || *e == '/' || *e == ':') {
        if (*e == '\n')
            ++lns;
        ++e;
    }

    /* should be the start of the first attribute, or < or > */
    if (!atchr(*e) && *e != '>' && *e != '<')
        return false;
    if (attr)
        *attr = e;

    /* one attribute per iteration, until the tag is closed */
    for (;;) {
        if (*e == '>' || *e == '<')
            break;
        if (!atchr(*e))
            return false;

        /* attribute name */
        while (atchr(*e))
            ++e;
        e = parseTagSkipSpace(e, &lns);
        if (*e != '=')
            continue;           /* attribute without a value */

        ++e;
        e = parseTagSkipSpace(e, &lns);

        if (isquote(*e)) {
            const unsigned char q = *e;

            /* A doubled quote does not end the value; keep scanning,
             * unless the second quote is directly followed by '>'. */
            do {
                ++e;
                while (*e && *e != q) {
                    if (*e == '\n')
                        ++lns;
                    ++e;
                }
                if (*e != q)
                    return false;       /* unterminated value */
                ++e;
            } while (*e == q && e[1] != '>');

            /* lots of tags end with an extra quote */
            if (*e == q)
                *e = ' ';
        } else {
            while (*e && !isspaceByte(*e) && *e != '>' && *e != '<')
                ++e;
        }

        e = parseTagSkipSpace(e, &lns);
    }

    /* *e could be < or > */
    if (end)
        *end = e + (*e == '>');
    if (lines)
        *lines = lns;
    return true;
}                               /* parseTag */

218

219

/* Don't know why he didn't use the stringAndChar() functions, but he

220

* invented something new here, so on we go. */

221

static void

222

valChar(char **sp, int *lp, char c)

223

{

224

char *s = *sp;

225

int l = *lp;

226

if(!(l % ALLOC_GR))

227

*sp = s = reallocMem(s, l + ALLOC_GR);

228

s[l++] = c;

229

*lp = l;

230

} /* valChar */

231

232

/*********************************************************************

233

Find an attribute in an html tag.

234

e is attr pointer previously gotten from parseTag, DON'T PASS HERE ANY OTHER VALUE!!!

235

name is the sought attribute.

236

returns allocated string containing the attribute, or NULL on unsuccess.

237

*********************************************************************/

238

239

char *

240

htmlAttrVal(const char *e, const char *name)

241

{

242

const char *n;

243

char *a = EMPTYSTRING; /* holds the value */

244

char *b;

245

int l = 0; /* length */

246

char f;

247

if(!e)

248

return a;

249

top:

250

while(isspaceByte(*e))

251

e++;

252

if(!*e)

253

return 0;

254

if(*e == '>' || *e == '<')

255

return 0;

256

n = name;

257

while(*n && !((*e ^ *n) & 0xdf))

258

e++, n++;

259

f = *n;

260

while(atchr(*e))

261

f = 'x', e++;

262

while(isspaceByte(*e))

263

e++;

264

if(*e != '=')

265

goto ea;

266

e++;

267

while(isspaceByte(*e))

268

e++;

269

if(!isquote(*e)) {

270

while(*e && !isspaceByte(*e) && *e != '>' && *e != '<') {

271

if(!f)

272

valChar(&a, &l, *e);

273

e++;

274

}

275

} else {

276

char uu = *e;

277

278

e++;

279

while(*e != uu) {

280

if(!*e) {

281

nzFree(a);

282

return NULL;

283

}

284

if(!f && *e != '\r') {

285

if(*e != '\t' && *e != '\n')

286

valChar(&a, &l, *e);

287

else if(!htmlAttrVal_nl)

288

valChar(&a, &l, ' ');

289

}

290

e++;

291

}

292

e++;

293

if(*e == uu) {

294

if(!f)

295

valChar(&a, &l, uu);

296

goto a;

297

}

298

}

299

ea:

300

if(f)

301

goto top; /* no match, next attribute */

302

if(l)

303

valChar(&a, &l, 0); /* null terminate */

304

if(strchr(a, '&')) {

305

b = a;

306

a = andTranslate(b, true);

307

nzFree(b);

308

}

309

/* strip leading and trailing spaces.

310

* Are we really suppose to do this? */

311

for(b = a; *b == ' '; b++) ;

312

if(b > a)

313

strcpy(a, b);

314

for(b = a + strlen(a) - 1; b >= a && *b == ' '; b--)

315

*b = 0;

316

return a;

317

} /* htmlAttrVal */

318

319

320

/*********************************************************************

321

Jump straight to the </script>, and don't look at anything in between.

322

Result parameters:

323

end of the script, the extracted script, and the number of newlines.

324

*********************************************************************/

325

326

bool

327

findEndScript(const char *h, const char *tagname,

328

bool is_js, char **end_p, char **new_p, int *lines)

329

{

330

char *end;

331

bool rc = true;

332

const char *s = h;

333

char look[12];

334

int js_nl = 0;

335

336

sprintf(look, "</%s>", tagname);

337

338

retry:

339

end = strstrCI(s, look);

340

if(!end) {

341

rc = false;

342

browseError(MSG_CloseTag, look);

343

end = (char *)h + strlen(h);

344

} else if(is_js) {

345

/* Check for document.write("</script>");

346

* This isn't legal javascript, but it happens all the time!

347

* This is a really stupid check.

348

* Scan forward 30 chars, on the same line, looking

349

* for a quote, and ) ; or + */

350

char c;

351

int j;

352

s = end + strlen(look);

353

for(j = 0; j < 30; ++j, ++s) {

354

c = *s;

355

if(!c)

356

break;

357

if(c == '\n')

358

break;

359

if(c != '"' && c != '\'')

360

continue;

361

while(s[1] == ' ')

362

++s;

363

c = s[1];

364

if(!c)

365

break;

366

if(strchr(";)+", c))

367

goto retry;

368

}

369

}

370

if(end_p)

371

*end_p = end;

372

if(new_p)

373

*new_p = pullString1(h, end);

374

/* count the newlines */

375

while(h < end) {

376

if(*h == '\n')

377

++js_nl;

378

++h;

379

}

380

381

*lines = js_nl;

382

return rc;

383

} /* findEndScript */

384

385

386

/*********************************************************************

387

The primary goal of this routine is to turn

388

Hey,{ click here } for more information

389

into

390

Hey, {click here} for more information

391

But of course we won't do that if the section is preformatted.

392

Nor can we muck with the whitespace that might be present in an input field <>.

393

State variables remember:

394

Whether we are in a preformatted section

395

Whether we have seen any visible text in the document

396

Whether we have seen any visible text in the current hyperlink,

397

between the braces.

398

Whether we are stepping through a span of whitespace.

399

A tag and adjacent whitespace might be swapped, depending on state.

400

If a change is made, the procedure is run again,

401

kinda like bubble sort.

402

It has the potential to be terribly inefficient,

403

but that's not likely.

404

Use cnt to count the iterations, just for debugging.

405

*********************************************************************/

406

407

void

408

anchorSwap(char *buf)

409

{

410

char c, d, *s, *ss, *w, *a;

411

bool premode, pretag, state_braces, state_text, state_atext;

412

bool strong, change, slash;

413

int n, cnt;

414

char tag[20];

415

416

static const char from[] =

417

"\x1b\x95\x99\x9c\x9d\x91\x92\x93\x94\xa0\xad\x96\x97\x85\xa6\xc2";

418

static const char becomes[] = "_*'`'`'`' ----- ";

419

420

/* Transliterate a few characters. One of them is 0xa0 to space,

421

* so we need to do this now, before the anchors swap with whitespace.

422

* Watch out for utf8 - don't translate the a0 in c3a0. That is a grave.

423

* But a0 by itself is breakspace; turn it into space.

424

* And c2a0 is a0 is breakspace.

425

* Then get rid of hyperlinks with absolutely nothing to click on. */

426

427

for(s = w = buf; c = *s; ++s) {

428

d = s[1];

429

/* utf8 test */

430

if((c & 0xc0) == 0xc0 && (d & 0xc0) == 0x80) {

431

unsigned int uni = 0;

432

if((c & 0x3c) == 0) {

433

/* fits in 8 bits */

434

uni = ((uchar) c << 6) | (d & 0x3f);

435

ss = strchr(from, (char)uni);

436

if(ss) {

437

c = becomes[ss - from];

438

++s;

439

goto put1;

440

}

441

}

442

/* copy the utf8 sequence */

443

*w++ = c;

444

++s;

445

c <<= 1;

446

while((c & 0x80) && ((d = *s) & 0xc0) == 0x80) {

447

*w++ = d;

448

++s;

449

}

450

--s;

451

continue;

452

}

453

454

ss = strchr(from, c);

455

if(ss)

456

c = becomes[ss - from];

457

458

if(c != InternalCodeChar)

459

goto put1;

460

if(!isdigitByte(s[1]))

461

goto put1;

462

for(a = s + 2; isdigitByte(*a); ++a) ;

463

if(*a != '{')

464

goto put1;

465

for(++a; *a == ' '; ++a) ;

466

if(a[0] != InternalCodeChar || a[1] != '0' || a[2] != '}')

467

goto put1;

468

s = a + 2;

469

continue;

470

471

put1:

472

*w++ = c;

473

}

474

*w = 0;

475

476

cnt = 0;

477

change = true;

478

while(change) {

479

change = false;

480

++cnt;

481

premode = state_text = state_atext = state_braces = false;

482

/* w represents the state of whitespace */

483

w = 0;

484

/* a represents the state of being in an anchor */

485

a = 0;

486

487

for(s = buf; c = *s; ++s) {

488

if(isspaceByte(c)) {

489

if(!w)

490

w = s;

491

continue;

492

}

493

494

/* end of white space, should we swap it with prior tag? */

495

if(w && a && !premode &&

496

((state_braces & !state_atext) ||

497

((!state_braces) & !state_text))) {

498

memcpy(a, w, s - w);

499

memcpy(a + (s - w), tag, n);

500

change = true;

501

w = 0;

502

}

503

504

/* prior anchor has no significance */

505

a = 0;

506

507

if(c == InternalCodeChar) {

508

if(!isdigitByte(s[1]))

509

goto normalChar;

510

n = strtol(s + 1, &ss, 10);

511

preFormatCheck(n, &pretag, &slash);

512

d = *ss;

513

/* the following should never happen */

514

if(!strchr("{}<>*", d))

515

goto normalChar;

516

n = ss + 1 - s;

517

memcpy(tag, s, n);

518

tag[n] = 0;

519

520

if(pretag) {

521

w = 0;

522

premode = !slash;

523

s = ss;

524

continue;

525

}

526

527

/* We have a tag, should we swap it with prior whitespace? */

528

if(w && !premode &&

529

(d == '}' ||

530

d == '@' &&

531

((state_braces & state_atext) ||

532

((!state_braces) & state_text)))) {

533

memmove(w + n, w, s - w);

534

memcpy(w, tag, n);

535

change = true;

536

w += n;

537

if(d == '}')

538

state_braces = false;

539

s = ss;

540

continue;

541

}

542

543

/* prior whitespace doesn't matter any more */

544

w = 0;

545

546

if(d == '{') {

547

state_braces = state_text = true;

548

state_atext = false;

549

a = s;

550

s = ss;

551

continue;

552

}

553

554

if(d == '}') {

555

state_braces = false;

556

s = ss;

557

continue;

558

}

559

560

if(d == '*') {

561

if(state_braces)

562

state_atext = true;

563

else

564

state_text = true;

565

a = s;

566

s = ss;

567

continue;

568

}

569

570

/* The remaining tags are <>, for an input field. */

571

s = ss;

572

c = d;

573

/* end of tag processing */

574

}

575

576

normalChar:

577

w = 0; /* no more whitespace */

578

if(state_braces)

579

state_atext = true;

580

else

581

state_text = true;

582

/* end of loop over the chars in the buffer */

583

}

584

/* end of loop making changes */

585

}

586

debugPrint(3, "anchorSwap %d", cnt);

587

588

/* Framing characters like [] around an anchor are unnecessary here,

589

* because we already frame it in braces.

590

* Get rid of these characters, even in premode.

591

* Also, remove trailing pipes on a line. */

592

ss = 0; /* remember location of first pipe */

593

for(s = w = buf; c = *s; ++s) {

594

char open, close, linkchar;

595

if(!strchr("{[(<", c))

596

goto putc;

597

if(s[1] != InternalCodeChar)

598

goto putc;

599

if(!isdigitByte(s[2]))

600

goto putc;

601

for(a = s + 3; isdigitByte(*a); ++a) ;

602

linkchar = 0;

603

if(*a == '{')

604

linkchar = '}';

605

if(*a == '<')

606

linkchar = '>';

607

if(!linkchar)

608

goto putc;

609

open = c;

610

close = 0;

611

if(open == '{')

612

close = '}';

613

if(open == '[')

614

close = ']';

615

if(open == '(')

616

close = ')';

617

if(open == '<')

618

close = '>';

619

n = 1;

620

while(n < 120) {

621

d = a[n++];

622

if(!d)

623

break;

624

if(d != InternalCodeChar)

625

continue;

626

while(isdigitByte(a[n]))

627

++n;

628

d = a[n++];

629

if(!d)

630

break; /* should never happen */

631

if(strchr("{}<>", d))

632

break;

633

}

634

if(n >= 120)

635

goto putc;

636

if(d != linkchar)

637

goto putc;

638

a += n;

639

if(*a != close)

640

goto putc;

641

++s;

642

memcpy(w, s, a - s);

643

w += a - s;

644

s = a;

645

ss = 0;

646

continue;

647

putc:

648

if(c == '|' && !ss)

649

ss = w;

650

if(strchr("\r\n\f", c) && ss)

651

w = ss, ss = 0;

652

if(!isspaceByte(c) && c != '|')

653

ss = 0;

654

*w++ = c;

655

} /* loop over buffer */

656

*w = 0;

657

debugPrint(3, "anchors unframed");

658

659

/* Now compress the implied linebreaks into one. */

660

premode = false;

661

for(s = buf; c = *s; ++s) {

662

if(c == InternalCodeChar && isdigitByte(s[1])) {

663

n = strtol(s + 1, &s, 10);

664

if(*s == '*') {

665

preFormatCheck(n, &pretag, &slash);

666

if(pretag)

667

premode = !slash;

668

}

669

}

670

if(!isspaceByte(c))

671

continue;

672

strong = false;

673

a = 0;

674

for(w = s; isspaceByte(*w); ++w) {

675

if(*w == '\n' || *w == '\f')

676

strong = true;

677

if(*w == '\r' && !a)

678

a = w;

679

}

680

ss = s, s = w - 1;

681

if(!a)

682

continue;

683

if(premode)

684

continue;

685

if(strong) {

686

for(w = ss; w <= s; ++w)

687

if(*w == '\r')

688

*w = ' ';

689

continue;

690

}

691

for(w = ss; w <= s; ++w)

692

if(*w == '\r' && w != a)

693

*w = ' ';

694

} /* loop over buffer */

695

debugPrint(3, "whitespace combined");

696

} /* anchorSwap */

697

698

699

/*********************************************************************

700

Format text, and break lines at sentence/phrase boundaries.

701

The prefix bl means breakline.

702

*********************************************************************/

703

704

/* Output buffer of the line breaker: its start, the position where the
 * next character goes, and the position at which it counts as full. */
static char *bl_start, *bl_cursor, *bl_end;
/* Set by appendOneChar() when a character did not fit. */
static bool bl_overflow;
static int colno;               /* column number */
static const int optimalLine = 80;      /* optimal line length */
static const int cutLineAfter = 36;     /* cut sentence after this column */
static const int paraLine = 120;        /* paragraph in a line */
/* longcut: index in the output buffer of a newline that
 * appendPrintableChunk() put in at a place other than the end of a
 * sentence, 0 if there is none.  appendSpaceChunk() may turn it back
 * into a space.
 * pre_cr: number of newlines that appendSpaceChunk() adds to its own
 * count; set by breakLine(). */
static int longcut, pre_cr;
/* Last space value: 0 after printable text, 1 after a space,
 * 2 after a newline, 3 = paragraph.  breakLine() also stores 4, as a
 * continuation code for its next invocation. */
static int lspace;
/* Location of period comma rightparen or any word.
 * Question mark is equivalent to period etc.
 * Other things being equal, we break at period, rather than comma, etc.
 * First the column numbers, then the index into the string. */
static int lperiod, lcomma, lright, lany;
static int idxperiod, idxcomma, idxright, idxany;

718

719

static void

720

debugChunk(const char *chunk, int len)

721

{

722

int i;

723

if(debugLevel < 7)

724

return;

725

printf("chunk<");

726

for(i = 0; i < len; ++i) {

727

char c = chunk[i];

728

if(c == '\t') {

729

printf("\\t");

730

continue;

731

}

732

if(c == '\n') {

733

printf("\\n");

734

continue;

735

}

736

if(c == '\f') {

737

printf("\\f");

738

continue;

739

}

740

if(c == '\r') {

741

printf("\\r");

742

continue;

743

}

744

if(c == '\0') {

745

printf("\\0");

746

continue;

747

}

748

printf("%c", c);

749

}

750

printf(">%d.%d\n", colno, lspace);

751

} /* debugChunk */

752

753

static void

754

appendOneChar(char c)

755

{

756

if(bl_cursor == bl_end)

757

bl_overflow = true;

758

else

759

*bl_cursor++ = c;

760

} /* appendOneChar */

761

762

static bool

763

spaceNotInInput(void)

764

{

765

char *t = bl_cursor;

766

char c;

767

for(--t; t >= bl_start; --t) {

768

c = *t;

769

if(c == '\n' || c == '\r')

770

return true;

771

if(c == '>' && t >= bl_start + 2 &&

772

t[-1] == '0' && t[-2] == InternalCodeChar)

773

return true;

774

if(c != '<')

775

continue;

776

while(t > bl_start && isdigitByte(t[-1]))

777

--t;

778

if(*t == '<')

779

continue;

780

if(t > bl_start && t[-1] == InternalCodeChar)

781

return false;

782

}

783

return true;

784

} /* spaceNotInInput */

785

786

static void

787

appendSpaceChunk(const char *chunk, int len, bool premode)

788

{

789

int nlc = pre_cr; /* newline count */

790

int spc = 0; /* space count */

791

int i, j;

792

char c, d, e;

793

794

if(!len)

795

return;

796

for(i = 0; i < len; ++i) {

797

c = chunk[i];

798

if(c == '\n' || c == '\r') {

799

++nlc, spc = 0;

800

continue;

801

}

802

if(c == '\f') {

803

nlc += 2, spc = 0;

804

continue;

805

}

806

++spc;

807

}

808

809

if(!premode && spaceNotInInput()) {

810

int l = bl_cursor - bl_start;

811

c = d = ' ';

812

if(l)

813

d = bl_cursor[-1];

814

if(l > 1)

815

c = bl_cursor[-2];

816

e = d;

817

if(strchr(")\"|}", d))

818

e = c;

819

if(strchr(".?!:", e)) {

820

bool ok = true;

821

/* Check for Mr. Mrs. and others. */

822

if(e == '.' && bl_cursor - bl_start > 10) {

823

static const char *const prefix[] =

824

{ "mr.", "mrs.", "sis.", "ms.", 0 };

825

char trailing[12];

826

for(i = 0; i < 6; ++i) {

827

c = bl_cursor[i - 6];

828

if(isupperByte(c))

829

c = tolower(c);

830

trailing[i] = c;

831

}

832

trailing[i] = 0;

833

for(i = 0; prefix[i]; ++i)

834

if(strstr(trailing, prefix[i]))

835

ok = false;

836

/* Check for John C. Calhoon */

837

if(isupperByte(bl_cursor[-2]) && isspaceByte(bl_cursor[-3]))

838

ok = false;

839

}

840

if(ok)

841

lperiod = colno, idxperiod = l;

842

}

843

e = d;

844

if(strchr(")\"|", d))

845

e = c;

846

if(strchr("-,;", e))

847

lcomma = colno, idxcomma = l;

848

if(strchr(")\"|", d))

849

lright = colno, idxright = l;

850

lany = colno, idxany = l;

851

/* tack a short fragment onto the previous line. */

852

if(longcut && colno <= 15 && (nlc || lperiod == colno)) {

853

bl_start[longcut] = ' ';

854

if(!nlc)

855

len = spc = 0, nlc = 1;

856

} /* pasting small fragment onto previous line */

857

} /* allowing line breaks */

858

if(lspace == 3)

859

nlc = 0;

860

if(nlc) {

861

if(lspace == 2)

862

nlc = 1;

863

appendOneChar('\n');

864

if(nlc > 1)

865

appendOneChar('\n');

866

colno = 1;

867

longcut = lperiod = lcomma = lright = lany = 0;

868

if(lspace >= 2 || nlc > 1)

869

lspace = 3;

870

if(lspace < 2)

871

lspace = 2;

872

if(!premode)

873

return;

874

}

875

if(!spc)

876

return;

877

if(!premode) {

878

/* if the first char of the text to be reformatted is space,

879

* then we will wind up here, with lspace = 3. */

880

if(lspace == 3)

881

return;

882

appendOneChar(' ');

883

++colno;

884

lspace = 1;

885

return;

886

}

887

j = -1;

888

for(i = 0; i < len; ++i) {

889

c = chunk[i];

890

if(c == '\n' || c == '\r' || c == '\f')

891

j = i;

892

}

893

i = j + 1;

894

if(i)

895

colno = 1;

896

for(; i < len; ++i) {

897

c = chunk[i];

898

if(c == 0)

899

c = ' ';

900

appendOneChar(c);

901

if(c == ' ')

902

++colno;

903

if(c == '\t')

904

colno += 4;

905

}

906

lspace = 1;

907

} /* appendSpaceChunk */

908

909

static void

910

appendPrintableChunk(const char *chunk, int len, bool premode)

911

{

912

int i, j;

913

for(i = 0; i < len; ++i)

914

appendOneChar(chunk[i]);

915

colno += len;

916

lspace = 0;

917

if(premode)

918

return;

919

if(colno <= optimalLine)

920

return;

921

/* Oops, line is getting long. Let's see where we can cut it. */

922

i = j = 0;

923

if(lperiod > cutLineAfter)

924

i = lperiod, j = idxperiod;

925

else if(lcomma > cutLineAfter)

926

i = lcomma, j = idxcomma;

927

else if(lright > cutLineAfter)

928

i = lright, j = idxright;

929

else if(lany > cutLineAfter)

930

i = lany, j = idxany;

931

if(!j)

932

return; /* nothing we can do about it */

933

longcut = 0;

934

if(i != lperiod)

935

longcut = j;

936

bl_start[j] = '\n';

937

colno -= i;

938

lperiod -= i;

939

lcomma -= i;

940

lright -= i;

941

lany -= i;

942

} /* appendPrintableChunk */

943

944

/* Break up a line using the above routines.

945

* The buffer for the new text must be supplied.

946

* Return false (fail) if we ran out of room.

947

* This function is called from bufsup.c, implementing the bl command,

948

* and is only in this file because it shares the above routines and variables

949

* with the html reformatting, which really has to be here. */

950

bool

951

breakLine(const char *line, int len, int *newlen)

952

{

953

char c, state, newstate;

954

int i, last;

955

956

pre_cr = 0;

957

if(len && line[len - 1] == '\r')

958

--len;

959

if(lspace == 4) {

960

/* special continuation code from the previous invokation */

961

lspace = 2;

962

if(line[0])

963

++pre_cr;

964

}

965

if(len > paraLine)

966

++pre_cr;

967

if(lspace < 2)

968

lspace = 2; /* should never happen */

969

if(!len + pre_cr)

970

lspace == 3;

971

bl_start = bl_cursor = replaceLine;

972

bl_end = replaceLine + REPLACELINELEN - 8;

973

bl_overflow = false;

974

colno = 1;

975

longcut = lperiod = lcomma = lright = lany = 0;

976

last = 0;

977

state = 0;

978

if(pre_cr)

979

state = 1;

980

981

for(i = 0; i < len; ++i) {

982

c = line[i];

983

newstate = 2;

984

if(!c || strchr(" \t\n\r\f", c))

985

newstate = 1;

986

if(state == newstate)

987

continue;

988

if(!state) {

989

state = newstate;

990

continue;

991

}

992

993

/* state change here */

994

debugChunk(line + last, i - last);

995

if(state == 1)

996

appendSpaceChunk(line + last, i - last, false);

997

else

998

appendPrintableChunk(line + last, i - last, false);

999

last = i;

1000

state = newstate;

1001

pre_cr = 0;

1002

}

1003

1004

if(state) { /* last token */

1005

debugChunk(line + last, len - last);

1006

if(state == 1)

1007

appendSpaceChunk(line + last, len - last, false);

1008

else

1009

appendPrintableChunk(line + last, len - last, false);

1010

}

1011

1012

if(lspace < 2) { /* line didn't have a \r at the end */

1013

appendSpaceChunk("\n", 1, false);

1014

}

1015

if(bl_cursor - bl_start > paraLine)

1016

lspace = 4;

1017

debugPrint(7, "chunk<EOL>%d.%d", colno, lspace);

1018

*newlen = bl_cursor - bl_start;

1019

return !bl_overflow;

1020

} /* breakLine */

1021

1022

void

1023

breakLineSetup(void)

1024

{

1025

lspace = 3;

1026

}

1027

1028

char *

1029

htmlReformat(const char *buf)

1030

{

1031

const char *h, *nh, *s;

1032

char c;

1033

bool premode = false;

1034

bool pretag, slash;

1035

char *new;

1036

int l, tagno;

1037

1038

longcut = lperiod = lcomma = lright = lany = 0;

1039

colno = 1;

1040

pre_cr = 0;

1041

lspace = 3;

1042

bl_start = bl_cursor = replaceLine;

1043

bl_end = replaceLine + REPLACELINELEN - 8;

1044

bl_overflow = false;

1045

new = initString(&l);

1046

1047

for(h = buf; (c = *h); h = nh) {

1048

if(isspaceByte(c)) {

1049

for(s = h + 1; isspaceByte(*s); ++s) ;

1050

nh = s;

1051

appendSpaceChunk(h, nh - h, premode);

1052

if(lspace == 3 || lspace == 2 &&

1053

(bl_cursor - bl_start) >= (bl_end - bl_start) * 2 / 3) {

1054

if(bl_cursor > bl_start)

1055

stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);

1056

bl_cursor = bl_start;

1057

lspace = 3;

1058

longcut = lperiod = lcomma = lright = lany = 0;

1059

colno = 1;

1060

}

1061

continue;

1062

}

1063

/* white space */

1064

if(c != InternalCodeChar) {

1065

for(s = h + 1; *s; ++s)

1066

if(isspaceByte(*s) || *s == InternalCodeChar)

1067

break;

1068

nh = s;

1069

appendPrintableChunk(h, nh - h, premode);

1070

continue;

1071

}

1072

1073

/* word */

1074

/* It's a tag */

1075

tagno = strtol(h + 1, (char **)&nh, 10);

1076

c = *nh++;

1077

if(!c || !strchr("{}<>*", c))

1078

i_printfExit(MSG_BadTagCode, tagno, c);

1079

appendPrintableChunk(h, nh - h, premode);

1080

preFormatCheck(tagno, &pretag, &slash);

1081

if(pretag)

1082

premode = !slash;

1083

1084

/* Insert newlines between adjacent hyperlinks. */

1085

if(c != '}' || premode)

1086

continue;

1087

for(h = nh; c = *h; ++h)

1088

if(!strchr(" \t,:-|;", c))

1089

break;

1090

if(!c || strchr("\r\n\f", c)) {

1091

nh = h;

1092

continue;

1093

}

1094

if(c != InternalCodeChar)

1095

continue;

1096

/* Does this start a new hyperlink? */

1097

for(s = h + 1; isdigitByte(*s); ++s) ;

1098

if(*s != '{')

1099

continue;

1100

appendSpaceChunk("\n", 1, false);

1101

nh = h;

1102

} /* loop over text */

1103

1104

/* close off the last line */

1105

if(lspace < 2)

1106

appendSpaceChunk("\n", 1, true);

1107

if(bl_cursor > bl_start)

1108

stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);

1109

/* Get rid of last space. */

1110

if(l >= 2 && new[l - 1] == '\n' && new[l - 2] == ' ')

1111

new[l - 2] = '\n', new[--l] = 0;

1112

/* Don't need empty lines at the end. */

1113

while(l > 1 && new[l - 1] == '\n' && new[l - 2] == '\n')

1114

--l;

1115

new[l] = 0;

1116

/* Don't allow an empty buffer */

1117

if(!l)

1118

stringAndChar(&new, &l, '\n');

1119

1120

return new;

1121

} /* htmlReformat */

1122

1123

1124

/*********************************************************************
And-convert the string; you know, &nbsp; &lt; etc.
This is the routine that makes it possible for me to read, and write,
my math site. http://www.mathreference.com/accessible.html
In the invisible mode, graphics characters are not rendered at all.
This is used when translating attributes inside tags,
such as HREF, in an anchor.
The original string is not disturbed.
The new string is allocated.
*********************************************************************/

1134

1135

char *

1136

andTranslate(const char *s, bool invisible)

1137

{

1138

char *new;

1139

int l, n, j;

1140

uchar c, d;

1141

uchar alnum = 0; /* was last char an alphanumeric */

1142

bool premode;

1143

char andbuf[16];

1144

1145

static const char *const andwords[] = {

1146

"gt\0>",

1147

"lt\0<",

1148

"quot\0\"",

1149

"raquo\0-",

1150

"ldquo\0\"",

1151

"rdquo\0\"",

1152

"lsquo\0'",

1153

"rsquo\0'",

1154

"plus\0+",

1155

"minus\0-",

1156

"mdash\0 - ",

1157

"ndash\0 - ",

1158

"colon\0:",

1159

"apos\0`",

1160

"star\0*",

1161

"comma\0,",

1162

"period\0.",

1163

"dot\0.",

1164

"dollar\0$",

1165

"percnt\0%",

1166

"amp\0&",

1167

"iexcl\0!",

1168

"cent\0\xa2",

1169

"pound\0\xa3",

1170

"yen\0\xa5",

1171

"brvbar\0\xa6",

1172

"copy\0\xa9",

1173

"reg\0\xae",

1174

"deg\0\xb0",

1175

"plusmn\0\xb1",

1176

"para\0\xb6",

1177

"sdot\0\xb7",

1178

"middot\0\xb7",

1179

"frac14\0\xbc",

1180

"half\0\xbd",

1181

"frac34\0\xbe",

1182

"iquest\0\xbf",

1183

"Agrave\0\xc0",

1184

"Aacute\0\xc1",

1185

"Acirc\0\xc2",

1186

"Atilde\0\xc3",

1187

"Auml\0\xc4",

1188

"Aring\0\xc5",

1189

"AElig\0\xc6",

1190

"Ccedil\0\xc7",

1191

"Egrave\0\xc8",

1192

"Eacute\0\xc9",

1193

"Ecirc\0\xca",

1194

"Euml\0\xcb",

1195

"Igrave\0\xcc",

1196

"Iacute\0\xcd",

1197

"Icirc\0\xce",

1198

"Iuml\0\xcf",

1199

"ETH\0\xd0",

1200

"Ntilde\0\xd1",

1201

"Ograve\0\xd2",

1202

"Oacute\0\xd3",

1203

"Ocirc\0\xd4",

1204

"Otilde\0\xd5",

1205

"Ouml\0\xd6",

1206

"times\0\xd7",

1207

"Oslash\0\xd8",

1208

"Ugrave\0\xd9",

1209

"Uacute\0\xda",

1210

"Ucirc\0\xdb",

1211

"Uuml\0\xdc",

1212

"Yacute\0\xdd",

1213

"THORN\0\xde",

1214

"szlig\0\xdf",

1215

"agrave\0\xe0",

1216

"aacute\0\xe1",

1217

"acirc\0\xe2",

1218

"atilde\0\xe3",

1219

"auml\0\xe4",

1220

"aring\0\xe5",

1221

"aelig\0\xe6",

1222

"ccedil\0\xe7",

1223

"egrave\0\xe8",

1224

"eacute\0\xe9",

1225

"ecirc\0\xea",

1226

"euml\0\xeb",

1227

"igrave\0\xec",

1228

"iacute\0\xed",

1229

"icirc\0\xee",

1230

"iuml\0\xef",

1231

"eth\0\xf0",

1232

"ntilde\0\xf1",

1233

"ograve\0\xf2",

1234

"oacute\0\xf3",

1235

"ocirc\0\xf4",

1236

"otilde\0\xf5",

1237

"ouml\0\xf6",

1238

"divide\0\xf7",

1239

"oslash\0\xf8",

1240

"ugrave\0\xf9",

1241

"uacute\0\xfa",

1242

"ucirc\0\xfb",

1243

"uuml\0\xfc",

1244

"yacute\0\xfd",

1245

"thorn\0\xfe",

1246

"yuml\0\xff",

1247

"Yuml\0Y",

1248

"itilde\0i",

1249

"Itilde\0I",

1250

"utilde\0u",

1251

"Utilde\0U",

1252

"edot\0e",

1253

"nbsp\0 ",

1254

"frac13\01/3",

1255

"frac23\02/3",

1256

"plusmn\0+-",

1257

"laquo\0left arrow",

1258

"#171\0left arrow",

1259

"raquo\0arrow",

1260

"#187\0arrow",

1261

"micro\0micro",

1262

"trade\0(TM)",

1263

"hellip\0...",

1264

"#275\0`",

1265

"#913\0Alpha",

1266

"#914\0Beta",

1267

"#915\0Gamma",

1268

"#916\0Delta",

1269

"#917\0Epsilon",

1270

"#918\0Zeta",

1271

"#919\0Eta",

1272

"#920\0Theta",

1273

"#921\0Iota",

1274

"#922\0Kappa",

1275

"#923\0Lambda",

1276

"#924\0Mu",

1277

"#925\0Nu",

1278

"#926\0Xi",

1279

"#927\0Omicron",

1280

"#928\0Pi",

1281

"#929\0Rho",

1282

"#931\0Sigma",

1283

"#932\0Tau",

1284

"#933\0Upsilon",

1285

"#934\0Phi",

1286

"#935\0Chi",

1287

"#936\0Psi",

1288

"#937\0Omega",

1289

"#945\0alpha",

1290

"#946\0beta",

1291

"#947\0gamma",

1292

"#948\0delta",

1293

"#949\0epsilon",

1294

"#950\0zeta",

1295

"#951\0eta",

1296

"#952\0theta",

1297

"#953\0iota",

1298

"#954\0kappa",

1299

"#955\0lambda",

1300

"#956\0mu",

1301

"#957\0nu",

1302

"#958\0xi",

1303

"#959\0omicron",

1304

"#960\0pi",

1305

"#961\0rho",

1306

"#962\0sigmaf",

1307

"#963\0sigma",

1308

"#964\0tau",

1309

"#965\0upsilon",

1310

"#966\0phi",

1311

"#967\0chi",

1312

"#968\0psi",

1313

"#969\0omega",

1314

"#177\0+-",

1315

"#8211\0-",

1316

"#8212\0 - ",

1317

"#8216\0`",

1318

"#8217\0'",

1319

"#8220\0`",

1320

"#8221\0'",

1321

"bull\0*",

1322

"#8226\0*",

1323

"#8230\0...",

1324

"#8242\0prime",

1325

"#8501\0aleph",

1326

"#8592\0left arrow",

1327

"#8593\0up arrow",

1328

"#8594\0arrow",

1329

"#8595\0down arrow",

1330

"#8660\0double arrow",

1331

"#8704\0every",

1332

"#8706\0d",

1333

"#8707\0some",

1334

"#8709\0empty set",

1335

"#8711\0del",

1336

"#8712\0member of",

1337

"#8713\0not a member of",

1338

"#8717\0such that",

1339

"#8721\0sum",

1340

"#8734\0infinity",

1341

"#8736\0angle",

1342

"#8745\0intersect",

1343

"#8746\0union",

1344

"#8747\0integral",

1345

"#8773\0congruent to",

1346

"#8800\0not equal",

1347

"#8804\0<=",

1348

"#8805\0>=",

1349

"#8834\0proper subset of",

1350

"#8835\0proper superset of",

1351

"#8836\0not a subset of",

1352

"#8838\0subset of",

1353

"#8839\0superset of",

1354

"#9658\0*",

1355

1356

};

1357

1358

if(!s)

1359

return 0;

1360

if(s == EMPTYSTRING)

1361

return EMPTYSTRING;

1362

new = initString(&l);

1363

1364

while(c = *s) {

1365

if(c == InternalCodeChar && !invisible) {

1366

const char *t = s + 1;

1367

while(isdigitByte(*t))

1368

++t;

1369

if(t > s + 1 && *t && strchr("{}<>*", *t)) { /* it's a tag */

1370

bool separate, pretag, slash;

1371

n = atoi(s + 1);

1372

preFormatCheck(n, &pretag, &slash);

1373

separate = (*t != '*');

1374

if(separate)

1375

alnum = 0;

1376

debugPrint(7, "tag %d%c separate %d", n, *t, separate);

1377

if(pretag)

1378

premode = !slash;

1379

++t;

1380

stringAndBytes(&new, &l, s, t - s);

1381

s = t;

1382

continue;

1383

} /* tag */

1384

}

1385

/* code */

1386

j = 1;

1387

if(c != '&')

1388

goto putc;

1389

1390

for(j = 0; j < sizeof (andbuf); ++j) {

1391

d = s[j + 1];

1392

if(d == '&' || d == ';' || d <= ' ')

1393

break;

1394

}

1395

if(j == sizeof (andbuf))

1396

goto putc; /* too long, no match */

1397

strncpy(andbuf, s + 1, j);

1398

andbuf[j] = 0;

1399

++j;

1400

if(s[j] == ';')

1401

++j;

1402

/* remove leading zeros */

1403

if(andbuf[0] == '#')

1404

while(andbuf[1] == '0')

1405

strcpy(andbuf + 1, andbuf + 2);

1406

1407

lookup:

1408

debugPrint(6, "meta %s", andbuf);

1409

n = stringInList(andwords, andbuf);

1410

if(n >= 0) { /* match */

1411

const char *r = andwords[n] + strlen(andwords[n]) + 1; /* replacement string */

1412

s += j;

1413

if(!r[1]) { /* replace with a single character */

1414

c = *r;

1415

if(c & 0x80 && cons_utf8) {

1416

static char utfbuf[4];

1417

utfbuf[0] = (0xc0 | ((uchar) c >> 6));

1418

utfbuf[1] = (0x80 | (c & 0x3f));

1419

utfbuf[2] = 0;

1420

r = utfbuf;

1421

goto putw;

1422

}

1423

--s;

1424

goto putc;

1425

}

1426

if(invisible) {

1427

s -= j;

1428

goto putc;

1429

}

1430

/* We're replacing with a word */

1431

if(!invisible && isalnumByte(*r)) {

1432

/* insert spaces either side */

1433

if(alnum)

1434

stringAndChar(&new, &l, ' ');

1435

alnum = 2;

1436

} else

1437

alnum = 0;

1438

putw:

1439

stringAndString(&new, &l, r);

1440

continue;

1441

}

1442

/* match */

1443

if(andbuf[0] != '#')

1444

goto putc;

1445

n = stringIsNum(andbuf + 1);

1446

if(n < 0)

1447

goto putc;

1448

if(n > 255)

1449

goto putc;

1450

c = n;

1451

/* don't allow nulls */

1452

if(c == 0)

1453

c = ' ';

1454

if(strchr("\r\n\f", c) && !premode)

1455

c = ' ';

1456

if(c == InternalCodeChar)

1457

c = ' ';

1458

s += j - 1;

1459

j = 1;

1460

1461

putc:

1462

if(isalnumByte(c)) {

1463

if(alnum == 2)

1464

stringAndChar(&new, &l, ' ');

1465

alnum = 1;

1466

} else

1467

alnum = 0;

1468

stringAndChar(&new, &l, c);

1469

++s;

1470

} /* loop over input string */

1471

1472

return new;

1473

} /* andTranslate */

1474

1475

/*********************************************************************

1476

Crunch a to-list or a copy-to-list down to its email addresses.

1477

Delimit them with commas.

1478

"Smith, John" <jsmith@whatever.com>

1479

becomes

1480

jsmith@whatever.com

1481

*********************************************************************/

1482

1483

/*
 * Rewrite line in place so that only the email addresses remain,
 * each one followed by a comma.
 *
 * Text inside double quotes is dropped.  When an entry contains <...>,
 * whatever was collected for that entry before the '<' is discarded and
 * the bracketed text is kept, with its spaces turned into underscores.
 * A comma outside quotes and brackets separates entries.
 *
 * A comma is appended to a nonempty result, so the buffer must have room
 * for one byte more than the text it holds on entry.
 */
void
extractEmailAddresses(char *line)
{
    char *in, *out;
    char *entry;                /* where the current entry begins in the output */
    char inside = 0;            /* 0, '"' or '<' */
    char ch;

    in = out = entry = line;
    for(; (ch = *in) != 0; ++in) {
        switch (ch) {
        case ',':
            if(inside)
                break;          /* not a separator here */
            /* entry separator: leave a space and start a new entry */
            *out++ = ' ';
            entry = out;
            continue;

        case '"':
            /* don't think you can quote in an email address */
            if(!inside)
                inside = ch;
            else if(inside == ch)
                inside = 0;
            continue;

        case '<':
            if(!inside) {
                inside = ch;
                out = entry;    /* throw away the display name */
            }
            continue;

        case '>':
            if(inside == '<')
                inside = 0;
            continue;

        default:
            break;
        }

        if(inside == '"')
            continue;

        if(ch < ' ')
            ch = ' ';
        if(inside == '<' && ch == ' ')
            ch = '_';
        *out++ = ch;
    }
    *out = 0;

    spaceCrunch(line, true, false);

    /* remaining spaces are the boundaries between addresses */
    for(in = line; *in; ++in) {
        if(*in == ' ')
            *in = ',';
    }
    if(line[0] != 0)
        strcat(line, ",");
}                               /* extractEmailAddresses */

1540

1541

/*
 * Remove from line every entry that matches dup.
 *
 * line    list of entries, each terminated by a comma; edited in place.
 * dup     the entry to remove; duplen bytes, need not be null terminated.
 * duplen  length of dup.
 *
 * An entry matches when it has the same length and memEqualCI() reports
 * the bytes equal (presumably a case-insensitive compare -- confirm).
 */
static void
cutDuplicateEmail(char *line, const char *dup, int duplen)
{
    char *comma;

    while(*line) {
        comma = strchr(line, ',');
        if(!comma)
            return;             /* should never happen */
        if(duplen == comma - line && memEqualCI(line, dup, duplen)) {
            /* Slide the rest of the list down over this entry.
             * Source and destination overlap, so strcpy is not allowed. */
            memmove(line, comma + 1, strlen(comma + 1) + 1);
            continue;           /* recheck whatever now sits at line */
        }
        line = comma + 1;
    }
}                               /* cutDuplicateEmail */

1557

1558

void

1559

cutDuplicateEmails(char *tolist, char *cclist, const char *reply)

1560

{

1561

int len;

1562

char *s, *t;

1563

1564

len = strlen(reply);

1565

if(len) {

1566

cutDuplicateEmail(tolist, reply, len);

1567

cutDuplicateEmail(cclist, reply, len);

1568

}

1569

1570

s = tolist;

1571

while(*s) {

1572

t = strchr(s, ',');

1573

if(!t)

1574

break; /* should never happen */

1575

len = t - s;

1576

++t;

1577

cutDuplicateEmail(t, s, len);

1578

cutDuplicateEmail(cclist, s, len);

1579

s = t;

1580

}

1581

1582

s = cclist;

1583

while(*s) {

1584

t = strchr(s, ',');

1585

if(!t)

1586

break; /* should never happen */

1587

len = t - s;

1588

++t;

1589

cutDuplicateEmail(t, s, len);

1590

s = t;

1591

}

1592

1593

/* If your email address is on the to or cc list, drop it.

1594

* But retain it if it is the reply, in case you sent mail to yourself. */

1595

if(reply[0]) {

1596

struct MACCOUNT *m = accounts;

1597

int i;

1598

for(i = 0; i < maxAccount; ++i, ++m) {

1599

const char *r = m->reply;

1600

if(!r)

1601

continue;

1602

len = strlen(r);

1603

cutDuplicateEmail(tolist, r, len);

1604

cutDuplicateEmail(cclist, r, len);

1605

}

1606

}

1607

} /* cutDuplicateEmails */

1608

1609

/*********************************************************************

1610

We got some data, from a file or from the internet.

1611

Count the binary characters and decide if this is, on the whole,

1612

binary or text. I allow some nonascii chars,

1613

like you might see in Spanish or German, and still call it text,

1614

but if there's too many such chars, I call it binary.

1615

It's not an exact science.

1616

*********************************************************************/

1617

1618

/*
 * Guess whether a buffer holds binary data rather than text.
 *
 * buf     the data; need not be null terminated.
 * buflen  number of bytes in buf.
 *
 * A byte counts as binary if it is zero or has its high bit set.
 * Returns true when 4 * (binary bytes) - 10 >= buflen.
 */
bool
looksBinary(const char *buf, int buflen)
{
    int i, bincount = 0;

    for(i = 0; i < buflen; ++i) {
        /* Look at the byte as unsigned, so the answer does not depend
         * on whether plain char is signed on this platform. */
        unsigned char c = (unsigned char)buf[i];
        if(c == 0 || c >= 0x80)
            ++bincount;
    }
    return (bincount * 4 - 10 >= buflen);
}                               /* looksBinary */

1629

1630

/*
 * Guess whether the non-ascii bytes in a buffer are iso8859 or utf8.
 *
 * buf      the data; need not be null terminated.
 * buflen   number of bytes in buf; nothing past it is read.
 * iso_p    set to true if the buffer looks like iso8859, else false.
 * utf8_p   set to true if the buffer looks like utf8, else false.
 *
 * Each well formed utf8 sequence counts once as utf8.  Any other byte
 * with the high bit set counts once as iso.  A flag is set when its
 * count is at least 6/7 of the two counts combined; both flags are
 * false for pure ascii.
 */
void
looks_8859_utf8(const char *buf, int buflen, bool * iso_p, bool * utf8_p)
{
    int utfcount = 0, isocount = 0;
    int i, k, bothcount;

    for(i = 0; i < buflen; ++i) {
        unsigned char lead = (unsigned char)buf[i];
        unsigned char mask;
        int follow = 0;         /* continuation bytes the lead byte calls for */
        bool wellformed;

        if(lead < 0x80)
            continue;           /* ascii */

        /* No second bit, it has to be iso. */
        if(!(lead & 0x40)) {
            ++isocount;
            continue;
        }

        /* one continuation byte for every 1 bit after the top bit */
        for(mask = 0x40; mask && (lead & mask); mask >>= 1)
            ++follow;

        /* A sequence cut off by the end of the buffer is not utf8. */
        wellformed = (follow < buflen - i);
        for(k = 1; wellformed && k <= follow; ++k) {
            if(((unsigned char)buf[i + k] & 0xc0) != 0x80)
                wellformed = false;
        }

        if(!wellformed) {
            /* Only the lead byte is consumed; the bytes after it
             * are examined on their own. */
            ++isocount;
            continue;
        }

        ++utfcount;
        i += follow;            /* step over the continuation bytes */
    }

    *iso_p = *utf8_p = false;

    bothcount = isocount + utfcount;
    if(!bothcount)
        return;                 /* ascii */
    bothcount *= 6;
    if(utfcount * 7 >= bothcount)
        *utf8_p = true;
    if(isocount * 7 >= bothcount)
        *iso_p = true;
}                               /* looks_8859_utf8 */

1669

1670

/*********************************************************************

1671

Convert a string from iso 8859 to utf8, or vice versa.

1672

In each case a new string is allocated.

1673

Don't forget to free it when you're done.

1674

*********************************************************************/

1675

1676

void

1677

iso2utf(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)

1678

{

1679

int i, j;

1680

int nacount = 0;

1681

char c;

1682

char *outbuf;

1683

1684

if(!inbuflen) {

1685

*outbuf_p = EMPTYSTRING;

1686

*outbuflen_p = 0;

1687

return;

1688

}

1689

1690

/* count chars, so we can allocate */

1691

for(i = 0; i < inbuflen; ++i) {

1692

c = inbuf[i];

1693

if(c < 0)

1694

++nacount;

1695

}

1696

1697

outbuf = allocMem(inbuflen + nacount + 1);

1698

for(i = j = 0; i < inbuflen; ++i) {

1699

c = inbuf[i];

1700

if(c >= 0) {

1701

outbuf[j++] = c;

1702

continue;

1703

}

1704

outbuf[j++] = ((uchar) c >> 6) | 0xc0;

1705

outbuf[j++] = (c & 0x3f) | 0x80;

1706

}

1707

outbuf[j] = 0; /* just for fun */

1708

1709

*outbuf_p = outbuf;

1710

*outbuflen_p = j;

1711

} /* iso2utf */

1712

1713

void

1714

utf2iso(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)

1715

{

1716

int i, j;

1717

char c;

1718

char *outbuf;

1719

1720

if(!inbuflen) {

1721

*outbuf_p = EMPTYSTRING;

1722

*outbuflen_p = 0;

1723

return;

1724

}

1725

1726

outbuf = allocMem(inbuflen + 1);

1727

for(i = j = 0; i < inbuflen; ++i) {

1728

c = inbuf[i];

1729

1730

/* regular chars and nonascii chars that aren't utf8 pass through. */

1731

/* There shouldn't be any of the latter */

1732

if(((uchar) c & 0xc0) != 0xc0) {

1733

outbuf[j++] = c;

1734

continue;

1735

}

1736

1737

/* Convertable into 8 bit */

1738

if(((uchar) c & 0xfc) == 0xc0 && ((uchar) inbuf[i + 1] & 0xc0) == 0x80) {

1739

outbuf[j++] = ((uchar) c << 6) | (inbuf[i + 1] & 0x3f);

1740

++i;

1741

continue;

1742

}

1743

1744

/* Higher unicodes, more than 2 bytes, are converted into 0x80 */

1745

c <<= 1;

1746

++i;

1747

for(++i; c < 0; ++i, c <<= 1) {

1748

if(((uchar) outbuf[i] & 0xc0) != 0x80)

1749

break;

1750

}

1751

outbuf[j++] = 0x80;

1752

--i;

1753

}

1754

outbuf[j] = 0; /* just for fun */

1755

1756

*outbuf_p = outbuf;

1757

*outbuflen_p = j;

1758

} /* utf2iso */

1759

1760

void

1761

iuReformat(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)

1762

{

1763

bool is8859, isutf8;

1764

1765

*outbuf_p = 0;

1766

*outbuflen_p = 0;

1767

if(!iuConvert)

1768

return;

1769

1770

looks_8859_utf8(inbuf, inbuflen, &is8859, &isutf8);

1771

if(cons_utf8 && is8859) {

1772

debugPrint(3, "converting to utf8");

1773

iso2utf(inbuf, inbuflen, outbuf_p, outbuflen_p);

1774

}

1775

if(!cons_utf8 && isutf8) {

1776

debugPrint(3, "converting to iso8859");

1777

utf2iso(inbuf, inbuflen, outbuf_p, outbuflen_p);

1778

}

1779

} /* iuReformat */

Older »