~ubuntu-branches/ubuntu/trusty/hyphen/trusty-proposed

« back to all changes in this revision

Viewing changes to hyphen.c

Committer: Bazaar Package Importer
Author(s): Chris Cheney
Date: 2008-07-01 20:44:19 UTC
mfrom: (1.1.2 upstream)
Revision ID: james.westby@ubuntu.com-20080701204419-x15zd2n5ei0yvqbk

Tags: 2.4-2ubuntu1

* Merge from debian unstable, remaining changes:
- Replaces old version of openoffice.org-hyphenation.

files added:
README.compound

debian/libhyphen0.shlibs

debian/patches/02_encds_static.dpatch

hyph_en_US.dic

tbhyphext.sh

tbhyphext.tex

tests/compound.hyph

tests/compound.pat

tests/compound.test

tests/compound.word

tests/compound2.hyph

tests/compound2.pat

tests/compound2.test

tests/compound2.word

tests/compound3.hyph

tests/compound3.pat

tests/compound3.test

tests/compound3.word

tests/compound4.hyph

tests/compound4.pat

tests/compound4.test

tests/compound4.word

tests/settings.hyph

tests/settings.pat

tests/settings.test

tests/settings.word

tests/settings2.hyph

tests/settings2.pat

tests/settings2.test

tests/settings2.word

tests/settings3.hyph

tests/settings3.pat

tests/settings3.test

tests/settings3.word

tests/settings4.hyph

tests/settings4.pat

tests/settings4.test

tests/settings4.word

files removed:
hyphen.mashed

files modified:
AUTHORS

ChangeLog

Makefile.am

Makefile.in

NEWS

README

README.nonstandard

TODO

configure

configure.in

debian/changelog

debian/patches/00list

debian/patches/01_hyphen_tex_from_TeXLive.dpatch

example.c

hyphen.c

hyphen.h

substrings.pl

tests/Makefile.am

tests/Makefile.in

tests/alt.pat

tests/alt2.pat

tests/alt3.pat

tests/alt4.pat

tests/base.hyph

tests/base.pat

tests/conv.pat

tests/test.sh

Show diffs side-by-side

added added

removed removed

hyphen.c

* This library is free software; you can redistribute it and/or

* modify it under the terms of the GNU Library General Public

233

HyphenDict *

234

hnj_hyphen_load (const char *fn)

235

{

236

HyphenDict *dict;

236

HyphenDict *dict[2];

237

HashTab *hashtab;

238

FILE *f;

239

char buf[MAX_CHARS];

243

signed char replindex;

244

signed char replcut;

245

int state_num = 0, last_state;

246

int i, j;

246

int i, j, k;

247

char ch;

248

int found;

249

HashEntry *e;

250

int nextlevel = 0;

250

251

252

f = fopen (fn, "r");

252

253

if (f == NULL)

253

254

return NULL;

254

255

256

// loading one or two dictionaries (separated by NEXTLEVEL keyword)

257

for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {

255

258

hashtab = hnj_hash_new ();

256

259

#ifdef VERBOSE

257

260

global = hashtab;

258

261

#endif

259

262

hnj_hash_insert (hashtab, "", 0);

260

261

dict = hnj_malloc (sizeof(HyphenDict));

262

dict->num_states = 1;

263

dict->states = hnj_malloc (sizeof(HyphenState));

264

dict->states[0].match = NULL;

265

dict->states[0].repl = NULL;

266

dict->states[0].fallback_state = -1;

267

dict->states[0].num_trans = 0;

268

dict->states[0].trans = NULL;

263

dict[k] = hnj_malloc (sizeof(HyphenDict));

264

dict[k]->num_states = 1;

265

dict[k]->states = hnj_malloc (sizeof(HyphenState));

266

dict[k]->states[0].match = NULL;

267

dict[k]->states[0].repl = NULL;

268

dict[k]->states[0].fallback_state = -1;

269

dict[k]->states[0].num_trans = 0;

270

dict[k]->states[0].trans = NULL;

271

dict[k]->nextlevel = NULL;

272

dict[k]->lhmin = 0;

273

dict[k]->rhmin = 0;

274

dict[k]->clhmin = 0;

275

dict[k]->crhmin = 0;

269

276

270

277

/* read in character set info */

271

for (i=0;i<MAX_NAME;i++) dict->cset[i]= 0;

272

fgets(dict->cset, sizeof(dict->cset),f);

273

for (i=0;i<MAX_NAME;i++)

274

if ((dict->cset[i] == '\r') || (dict->cset[i] == '\n'))

275

dict->cset[i] = 0;

276

dict->utf8 = (strcmp(dict->cset, "UTF-8") == 0);

278

if (k == 0) {

279

for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;

280

fgets(dict[k]->cset, sizeof(dict[k]->cset),f);

281

for (i=0;i<MAX_NAME;i++)

282

if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))

283

dict[k]->cset[i] = 0;

284

dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);

285

} else {

286

strcpy(dict[k]->cset, dict[0]->cset);

287

dict[k]->utf8 = dict[0]->utf8;

288

}

277

289

278

290

while (fgets (buf, sizeof(buf), f) != NULL)

279

291

{

280

292

if (buf[0] != '%')

281

293

{

294

if (strncmp(buf, "NEXTLEVEL", 9) == 0) {

295

nextlevel = 1;

296

break;

297

} else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {

298

dict[k]->lhmin = atoi(buf + 13);

299

continue;

300

} else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {

301

dict[k]->rhmin = atoi(buf + 14);

302

continue;

303

} else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {

304

dict[k]->clhmin = atoi(buf + 21);

305

continue;

306

} else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {

307

dict[k]->crhmin = atoi(buf + 22);

308

continue;

309

}

282

310

j = 0;

283

311

pattern[j] = '0';

284

312

repl = strchr(buf, '/');

322

350

} else {

323

351

if (*word == '.') i++;

324

352

/* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */

325

if (dict->utf8) {

353

if (dict[k]->utf8) {

326

354

int pu = -1; /* unicode character position */

327

355

int ps = -1; /* unicode start position (original replindex) */

328

356

int pc = (*word == '.') ? 1: 0; /* 8-bit character position */

346

374

printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);

347

375

#endif

348

376

found = hnj_hash_lookup (hashtab, word);

349

state_num = hnj_get_state (dict, hashtab, word);

350

dict->states[state_num].match = hnj_strdup (pattern + i);

351

dict->states[state_num].repl = repl;

352

dict->states[state_num].replindex = replindex;

377

state_num = hnj_get_state (dict[k], hashtab, word);

378

dict[k]->states[state_num].match = hnj_strdup (pattern + i);

379

dict[k]->states[state_num].repl = repl;

380

dict[k]->states[state_num].replindex = replindex;

353

381

if (!replcut) {

354

dict->states[state_num].replcut = strlen(word);

382

dict[k]->states[state_num].replcut = strlen(word);

355

383

} else {

356

dict->states[state_num].replcut = replcut;

384

dict[k]->states[state_num].replcut = replcut;

357

385

}

358

386

359

387

/* now, put in the prefix transitions */

363

391

ch = word[j - 1];

364

392

word[j - 1] = '\0';

365

393

found = hnj_hash_lookup (hashtab, word);

366

state_num = hnj_get_state (dict, hashtab, word);

367

hnj_add_trans (dict, state_num, last_state, ch);

394

state_num = hnj_get_state (dict[k], hashtab, word);

395

hnj_add_trans (dict[k], state_num, last_state, ch);

368

396

}

369

397

}

370

398

}

399

427

}

400

428

/* KBH: FIXME state 0 fallback_state should always be -1? */

401

429

if (e->val)

402

dict->states[e->val].fallback_state = state_num;

430

dict[k]->states[e->val].fallback_state = state_num;

403

431

}

404

432

#ifdef VERBOSE

405

433

for (i = 0; i < HASH_SIZE; i++)

406

434

for (e = hashtab->entries[i]; e; e = e->next)

407

435

{

408

436

printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,

409

dict->states[e->val].fallback_state);

410

for (j = 0; j < dict->states[e->val].num_trans; j++)

411

printf (" %c->%d\n", dict->states[e->val].trans[j].ch,

412

dict->states[e->val].trans[j].new_state);

437

dict[k]->states[e->val].fallback_state);

438

for (j = 0; j < dict[k]->states[e->val].num_trans; j++)

439

printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,

440

dict[k]->states[e->val].trans[j].new_state);

413

441

}

414

442

#endif

415

443

416

444

#ifndef VERBOSE

417

445

hnj_hash_free (hashtab);

418

446

#endif

419

420

return dict;

447

state_num = 0;

448

}

449

fclose(f);

450

if (k == 2) dict[0]->nextlevel = dict[1];

451

return dict[0];

421

452

}

422

453

423

454

void hnj_hyphen_free (HyphenDict *dict)

435

466

if (hstate->trans)

436

467

hnj_free (hstate->trans);

437

468

}

469

if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);

438

470

439

471

hnj_free (dict->states);

440

472

563

595

564

596

if (prep_word != prep_word_buf)

565

597

hnj_free (prep_word);

598

566

599

return 0;

567

600

}

568

601

569

int hnj_hyphen_hyphenate2 (HyphenDict *dict,

570

const char *word, int word_size, char * hyphens,

571

char *hyphword, char *** rep, int ** pos, int ** cut)

602

/*
 * Count the characters contained in the first n bytes of word.
 *
 * word: input string; counting also stops at a terminating NUL byte
 * n:    maximum number of bytes to examine
 * utf8: non-zero if word is UTF-8 encoded, in which case continuation
 *       bytes (bit pattern 10xxxxxx) are not counted as characters;
 *       zero for 8-bit encodings, where every byte counts as one character
 *
 * Returns the number of characters found (0 when n <= 0 or word is empty).
 * A character whose first byte lies inside the window is counted even if
 * its continuation bytes extend past it.
 */
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
{
    int i = 0;  /* characters counted so far */
    int j = 0;  /* byte offset into word */
    while (j < n && word[j] != '\0') {
        i++;
        /* skip the continuation bytes of the current character, but never
           read outside the n-byte window: the buffer is not required to be
           NUL-terminated at or after byte n */
        for (j++; utf8 && j < n && (word[j] & 0xc0) == 0x80; j++);
    }
    return i;
}

613

614

/*
 * Enforce the minimum number of characters in front of the first hyphen.
 *
 * Walks over the first (lhmin - 1) characters of word and clears the hyphen
 * mark (sets hyphens[j] to '0') on every byte belonging to them.
 *
 * utf8:      non-zero if word is UTF-8; all bytes of a multi-byte character
 *            are then handled as one character
 * word:      the word being hyphenated (NUL-terminated)
 * word_size: not used by this function
 * hyphens:   per-byte hyphenation marks, modified in place
 * rep/pos/cut: non-standard hyphenation data; when all three arrays exist
 *            and (*rep)[j] is set, the hyphen is removed (and the
 *            replacement string freed) only if the replacement contains '='
 *            and the characters before that '=' would be fewer than lhmin;
 *            otherwise the entry is kept
 * lhmin:     minimum number of characters required before a hyphen
 *
 * Always returns 0.
 */
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut, int lhmin)
{
    int i, j;
    /* i counts characters, j counts bytes */
    for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) do {
        /* check length of the non-standard part */
        if (*rep && *pos && *cut && (*rep)[j]) {
            char * rh = strchr((*rep)[j], '=');
            if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
                hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
                free((*rep)[j]);
                (*rep)[j] = NULL;
                hyphens[j] = '0';
            }
        } else {
            hyphens[j] = '0';
        }
        j++;
        /* stay inside the current character while the next byte is a UTF-8
           continuation byte (10xxxxxx), so that the mark on the last byte
           of a multi-byte character is cleared as well */
    } while (utf8 && (word[j] & 0xc0) == 0x80);
    return 0;
}

635

636

/*
 * Enforce the minimum number of characters after the last hyphen.
 *
 * Walks backwards from byte word_size - 2 and clears the hyphen mark (sets
 * hyphens[j] to '0') while fewer than rhmin characters would follow it.
 * hyphens[0] is never touched (the loop stops at j == 0).
 *
 * utf8:      non-zero if word is UTF-8; characters are then counted by
 *            their first byte, so continuation bytes (10xxxxxx) do not
 *            count as characters of their own
 * word:      the word being hyphenated
 * word_size: length of word in bytes
 * hyphens:   per-byte hyphenation marks, modified in place
 * rep/pos/cut: non-standard hyphenation data; when all three arrays exist
 *            and (*rep)[j] is set, the hyphen is removed (and the
 *            replacement string freed) only if the replacement contains '='
 *            and the characters after that '=' plus the rest of the word
 *            would be fewer than rhmin; otherwise the entry is kept
 * rhmin:     minimum number of characters required after a hyphen
 *
 * Always returns 0.
 */
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut, int rhmin)
{
    int i;                  /* characters to the right of hyphens[j] */
    int j = word_size - 2;  /* byte position of the hyphen mark examined */

    if (word_size < 2) return 0;

    /* the last byte only counts as a character if it starts one */
    i = (!utf8 || (word[word_size - 1] & 0xc0) != 0x80) ? 1 : 0;

    for (; i < rhmin && j > 0; j--) {
        /* check length of the non-standard part */
        if (*rep && *pos && *cut && (*rep)[j]) {
            char * rh = strchr((*rep)[j], '=');
            /* 100 is the byte limit handed to hnj_hyphen_strnlen for the
               remainder of the word; counting stops at the NUL anyway */
            if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
                hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
                free((*rep)[j]);
                (*rep)[j] = NULL;
                hyphens[j] = '0';
            }
        } else {
            hyphens[j] = '0';
        }
        /* moving one byte to the left adds a character on the right only
           when byte j starts a character (ASCII or UTF-8 lead byte) */
        if (!utf8 || (word[j] & 0xc0) != 0x80) i++;
    }
    return 0;
}

658

659

// recursive function for compound level hyphenation

660

int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,

661

char * hyphens, char *** rep, int ** pos, int ** cut,

662

int clhmin, int crhmin, int lend, int rend)

572

663

{

573

664

char prep_word_buf[MAX_WORD];

574

665

char *prep_word;

709

800

putchar ('\n');

710

801

#endif

711

802

712

for (i = 0; i < j - 4; i++)

803

for (i = 0; i < j - 3; i++)

713

804

#if 0

714

805

if (hyphens[i + 1] & 1)

715

806

hyphens[i] = '-';

716

807

#else

717

808

hyphens[i] = hyphens[i + 1];

718

809

#endif

719

hyphens[0] = '0';

720

810

for (; i < word_size; i++)

721

811

hyphens[i] = '0';

722

812

hyphens[word_size] = '\0';

723

813

724

725

if (prep_word != prep_word_buf) {

726

hnj_free (prep_word);

727

}

728

729

730

814

/* now create a new char string showing hyphenation positions */

731

/* count the hyphens and allocate space for the new hypehanted string */

815

/* count the hyphens and allocate space for the new hyphenated string */

732

816

nHyphCount = 0;

733

817

for (i = 0; i < word_size; i++)

734

818

if (hyphens[i]&1)

752

836

(*pos)[matchindex[i] - 1] = matchindex[i] - i;

753

837

(*cut)[matchindex[i] - 1] = matchlen[i];

754

838

}

755

if (hyphword) strcpy(hyphword + j, matchrepl[matchindex[i]]);

756

839

j += strlen(matchrepl[matchindex[i]]);

757

840

i += matchlen[i] - 1;

758

} else if (hyphword) {

759

hyphword[j++] = word[i];

760

if ((hyphens[i]&1) && !(isrepl && ((i+1) < word_size) &&

761

(matchindex[i+1] >= 0) && matchrepl[matchindex[i+1]])) hyphword[j++] = '=';

762

841

}

763

842

}

764

843

765

if (hyphword) hyphword[j] = '\0';

766

767

844

if (matchrepl != matchrepl_buf) {

768

845

hnj_free (matchrepl);

769

846

hnj_free (matchlen);

770

847

hnj_free (matchindex);

771

848

}

772

849

773

if (!(dict->utf8)) return 0;

774

850

// recursive hyphenation of the first (compound) level segments

851

if (dict->nextlevel) {

852

char * rep2_buf[MAX_WORD];

853

int pos2_buf[MAX_WORD];

854

int cut2_buf[MAX_WORD];

855

char hyphens2_buf[MAX_WORD];

856

char ** rep2;

857

int * pos2;

858

int * cut2;

859

char * hyphens2;

860

int begin = 0;

861

if (word_size < MAX_CHARS) {

862

rep2 = rep2_buf;

863

pos2 = pos2_buf;

864

cut2 = cut2_buf;

865

hyphens2 = hyphens2_buf;

866

} else {

867

rep2 = hnj_malloc (word_size * sizeof(char *));

868

pos2 = hnj_malloc (word_size * sizeof(int));

869

cut2 = hnj_malloc (word_size * sizeof(int));

870

hyphens2 = hnj_malloc (word_size);

871

}

872

for (i = 0; i < word_size; i++) rep2[i] = NULL;

873

for (i = 0; i < word_size; i++) if

874

(hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {

875

if (i - begin > 1) {

876

int hyph = 0;

877

prep_word[i + 2] = '\0';

878

/* non-standard hyphenation at compound boundary (Schiffahrt) */

879

if (*rep && *pos && *cut && (*rep)[i]) {

880

char * l = strchr((*rep)[i], '=');

881

strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);

882

if (l) {

883

hyph = (l - (*rep)[i]) - (*pos)[i];

884

prep_word[2 + i + hyph] = '\0';

885

}

886

}

887

hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,

888

hyphens2, &rep2, &pos2, &cut2, clhmin,

889

crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));

890

for (j = 0; j < i - begin - 1; j++) {

891

hyphens[begin + j] = hyphens2[j];

892

if (rep2[j] && rep && pos && cut) {

893

if (!*rep && !*pos && !*cut) {

894

int k;

895

*rep = (char **) malloc(sizeof(char *) * word_size);

896

*pos = (int *) malloc(sizeof(int) * word_size);

897

*cut = (int *) malloc(sizeof(int) * word_size);

898

for (k = 0; k < word_size; k++) {

899

(*rep)[k] = NULL;

900

(*pos)[k] = 0;

901

(*cut)[k] = 0;

902

}

903

}

904

(*rep)[begin + j] = rep2[j];

905

(*pos)[begin + j] = pos2[j];

906

(*cut)[begin + j] = cut2[j];

907

}

908

}

909

prep_word[i + 2] = word[i + 1];

910

if (*rep && *pos && *cut && (*rep)[i]) {

911

strcpy(prep_word + 1, word);

912

}

913

}

914

begin = i + 1;

915

for (j = 0; j < word_size; j++) rep2[j] = NULL;

916

}

917

918

// non-compound

919

if (begin == 0) {

920

hnj_hyphen_hyph_(dict->nextlevel, word, word_size,

921

hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);

922

if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

923

rep, pos, cut, clhmin);

924

if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

925

rep, pos, cut, crhmin);

926

}

927

928

if (rep2 != rep2_buf) {

929

free(rep2);

930

free(cut2);

931

free(pos2);

932

free(hyphens2);

933

}

934

}

935

936

if (prep_word != prep_word_buf) hnj_free (prep_word);

937

return 0;

938

}

939

940

/* UTF-8 normalization of hyphen and non-standard positions */

941

int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,

942

char *** rep, int ** pos, int ** cut)

943

{

775

944

if ((((unsigned char) word[0]) >> 6) == 2) {

776

945

fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);

777

946

return 1;

778

947

}

779

948

780

949

/* calculate UTF-8 character positions */

781

j = -1;

782

for (i = 0; i < word_size; i++) {

950

int i, j, k;

951

for (i = 0, j = -1; i < word_size; i++) {

783

952

/* beginning of an UTF-8 character (not '10' start bits) */

784

953

if ((((unsigned char) word[i]) >> 6) != 2) j++;

785

954

hyphens[j] = hyphens[i];

806

975

hyphens[j + 1] = '\0';

807

976

return 0;

808

977

}

978

979

/* get the word with all possible hyphenations (output: hyphword) */

980

void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,

981

char * hyphword, char *** rep, int ** pos, int ** cut)

982

{

983

int i, j;

984

for (i = 0, j = 0; i < l; i++, j++) {

985

if (hyphens[i]&1) {

986

hyphword[j] = word[i];

987

if (*rep && *pos && *cut && (*rep)[i]) {

988

strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);

989

j += strlen((*rep)[i]) - (*pos)[i];

990

i += (*cut)[i] - (*pos)[i];

991

} else hyphword[++j] = '=';

992

} else hyphword[j] = word[i];

993

}

994

hyphword[j] = '\0';

995

}

996

997

998

/* main api function with default hyphenmin parameters */

999

int hnj_hyphen_hyphenate2 (HyphenDict *dict,

1000

const char *word, int word_size, char * hyphens,

1001

char *hyphword, char *** rep, int ** pos, int ** cut)

1002

{

1003

hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

1004

dict->clhmin, dict->crhmin, 1, 1);

1005

hnj_hyphen_lhmin(dict->utf8, word, word_size,

1006

hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));

1007

hnj_hyphen_rhmin(dict->utf8, word, word_size,

1008

hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));

1009

if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

1010

if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);

1011

return 0;

1012

}

1013

1014

/* previous main api function with hyphenmin parameters */

1015

int hnj_hyphen_hyphenate3 (HyphenDict *dict,

1016

const char *word, int word_size, char * hyphens,

1017

char *hyphword, char *** rep, int ** pos, int ** cut,

1018

int lhmin, int rhmin, int clhmin, int crhmin)

1019

{

1020

lhmin = (lhmin > 0 ? lhmin : dict->lhmin);

1021

rhmin = (rhmin > 0 ? rhmin : dict->rhmin);

1022

hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

1023

clhmin, crhmin, 1, 1);

1024

hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

1025

rep, pos, cut, (lhmin > 0 ? lhmin : 2));

1026

hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

1027

rep, pos, cut, (rhmin > 0 ? rhmin : 2));

1028

if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

1029

if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);

1030

return 0;

1031

}

Older »