1
//========================================================================
3
// CharCodeToUnicode.cc
5
// Copyright 2001-2003 Glyph & Cog, LLC
7
//========================================================================
9
//========================================================================
11
// Modified under the Poppler project - http://poppler.freedesktop.org
13
// All changes made under the Poppler project to this file are licensed
14
// under GPL version 2 or later
16
// Copyright (C) 2006, 2008-2010 Albert Astals Cid <aacid@kde.org>
17
// Copyright (C) 2007 Julien Rebetez <julienr@svn.gnome.org>
18
// Copyright (C) 2007 Koji Otani <sho@bbr.jp>
19
// Copyright (C) 2008 Michael Vrable <mvrable@cs.ucsd.edu>
20
// Copyright (C) 2008 Vasile Gaburici <gaburici@cs.umd.edu>
21
// Copyright (C) 2010 William Bader <williambader@hotmail.com>
22
// Copyright (C) 2010 Jakub Wilk <ubanus@users.sf.net>
24
// To see a description of the changes please see the Changelog file that
25
// came with your tarball or type make ChangeLog if you are building from git
27
//========================================================================
31
#ifdef USE_GCC_PRAGMAS
32
#pragma implementation
38
#include "goo/gfile.h"
39
#include "goo/GooLikely.h"
40
#include "goo/GooString.h"
42
#include "GlobalParams.h"
43
#include "PSTokenizer.h"
44
#include "CharCodeToUnicode.h"
46
//------------------------------------------------------------------------
48
// Entry type of the string maps (sMap / sMapA) built in this file.  The
// code below assigns its members `c` (a char code), `u` (a gmallocn'd
// array of Unicode values) and `len` (the number of values in `u`).
struct CharCodeToUnicodeString {
54
//------------------------------------------------------------------------
56
// Character-source callback passed to parseCMap1() by parseCMap() and
// mergeCMap(); both pass the address of a char* cursor as `data`.
// NOTE(review): the body is not visible in this listing -- confirm how the
// cursor is advanced and what is returned at end of string.
static int getCharFromString(void *data) {
70
// Character-source callback for file input: `data` is treated as a FILE*
// and the next character is read from it with fgetc().
static int getCharFromFile(void *data) {
71
return fgetc((FILE *)data);
74
//------------------------------------------------------------------------
76
// Builds a CharCodeToUnicode from the text file named by `fileName`.
// The file is read line by line and a hexadecimal value is parsed from
// each line into a growable Unicode array; the resulting object is tagged
// with a copy of `collection`.
CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(GooString *fileName,
77
GooString *collection) {
80
CharCode size, mapLenA;
83
CharCodeToUnicode *ctu;
85
// Report a file that cannot be opened for reading.
if (!(f = fopen(fileName->getCString(), "r"))) {
86
error(-1, "Couldn't open cidToUnicode file '%s'",
87
fileName->getCString());
92
mapA = (Unicode *)gmallocn(size, sizeof(Unicode));
95
while (getLine(buf, sizeof(buf), f)) {
96
// Array is full: it is reallocated below before the next entry is stored.
if (mapLenA == size) {
98
mapA = (Unicode *)greallocn(mapA, size, sizeof(Unicode));
100
// A line is accepted when it starts with a parsable hex number.
if (sscanf(buf, "%x", &u) == 1) {
103
// The reported line number is the 1-based index of the current entry.
error(-1, "Bad line (%d) in cidToUnicode file '%s'",
104
(int)(mapLenA + 1), fileName->getCString());
111
ctu = new CharCodeToUnicode(collection->copy(), mapA, mapLenA, gTrue,
117
// Builds a CharCodeToUnicode from the text file named by `fileName`.
// Each line is split on spaces/tabs/CR/LF: the first token is a hex code
// (u0) and the following tokens are hex Unicode values.  The values are
// gathered in uBuf and stored either in the direct map (mapA) or as a
// CharCodeToUnicodeString entry in sMapA.  The result is tagged with a
// copy of `fileName`.
CharCodeToUnicode *CharCodeToUnicode::parseUnicodeToUnicode(
118
GooString *fileName) {
121
CharCodeToUnicodeString *sMapA;
122
CharCode size, oldSize, len, sMapSizeA, sMapLenA;
127
// Scratch buffer for the Unicode values of the line being parsed.
Unicode *uBuf = (Unicode *)gmallocn(uBufSize, sizeof(Unicode));
128
CharCodeToUnicode *ctu;
132
if (!(f = fopen(fileName->getCString(), "r"))) {
134
error(-1, "Couldn't open unicodeToUnicode file '%s'",
135
fileName->getCString());
140
// The direct map starts out zero-filled.
mapA = (Unicode *)gmallocn(size, sizeof(Unicode));
141
memset(mapA, 0, size * sizeof(Unicode));
144
sMapSizeA = sMapLenA = 0;
147
while (getLine(buf, sizeof(buf), f)) {
149
// First token on the line: the source code, in hex.
if (!(tok = strtok_r(buf, " \t\r\n", &tokptr)) ||
150
sscanf(tok, "%x", &u0) != 1) {
151
error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
152
line, fileName->getCString());
156
// Remaining tokens: the Unicode values for u0, in hex.
while ((tok = strtok_r(NULL, " \t\r\n", &tokptr))) {
160
uBuf = (Unicode *)greallocn(uBuf, uBufSize, sizeof(Unicode));
162
if (sscanf(tok, "%x", &uBuf[n]) != 1) {
163
error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
164
line, fileName->getCString());
170
error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
171
line, fileName->getCString());
179
// After growing the direct map, zero only the newly added tail.
mapA = (Unicode *)greallocn(mapA, size, sizeof(Unicode));
180
memset(mapA + oldSize, 0, (size - oldSize) * sizeof(Unicode));
186
// String map is full: it is reallocated before the new entry is added.
if (sMapLenA == sMapSizeA) {
188
sMapA = (CharCodeToUnicodeString *)
189
greallocn(sMapA, sMapSizeA, sizeof(CharCodeToUnicodeString));
191
// Store a private copy of the n values collected in uBuf.
sMapA[sMapLenA].c = u0;
192
sMapA[sMapLenA].u = (Unicode*)gmallocn(n, sizeof(Unicode));
193
for (i = 0; i < n; ++i) {
194
sMapA[sMapLenA].u[i] = uBuf[i];
196
sMapA[sMapLenA].len = n;
205
ctu = new CharCodeToUnicode(fileName->copy(), mapA, len, gTrue,
206
sMapA, sMapLenA, sMapSizeA);
212
// Creates an untagged CharCodeToUnicode from the 256-entry table
// `toUnicode`, with no string-map entries.
CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) {
213
return new CharCodeToUnicode(NULL, toUnicode, 256, gTrue, NULL, 0, 0);
216
// Creates an untagged CharCodeToUnicode and fills it by running
// parseCMap1() over the in-memory CMap text held in `buf`.
CharCodeToUnicode *CharCodeToUnicode::parseCMap(GooString *buf, int nBits) {
217
CharCodeToUnicode *ctu;
220
ctu = new CharCodeToUnicode(NULL);
221
// p is the read cursor; its address is the callback's `data` argument.
p = buf->getCString();
222
ctu->parseCMap1(&getCharFromString, &p, nBits);
226
// Creates an untagged CharCodeToUnicode and, if
// globalParams->findToUnicodeFile(fileName) yields a file, fills it by
// running parseCMap1() over that file.  An error is reported when no file
// is found.
CharCodeToUnicode *CharCodeToUnicode::parseCMapFromFile(GooString *fileName,
228
CharCodeToUnicode *ctu;
231
ctu = new CharCodeToUnicode(NULL);
232
if ((f = globalParams->findToUnicodeFile(fileName))) {
233
ctu->parseCMap1(&getCharFromFile, f, nBits);
236
error(-1, "Couldn't find ToUnicode CMap file for '%s'",
237
fileName->getCString());
242
// Runs parseCMap1() over the CMap text in `buf` on this existing object,
// so the mappings it finds are added to the ones already present.
void CharCodeToUnicode::mergeCMap(GooString *buf, int nBits) {
245
// p is the read cursor; its address is the callback's `data` argument.
p = buf->getCString();
246
parseCMap1(&getCharFromString, &p, nBits);
249
// Tokenizes ToUnicode CMap text obtained through getCharFunc(data) with a
// PSTokenizer and records the mappings it finds via addMapping().
//
// Three operators are recognised (tok1 holds the token preceding tok2):
//   usecmap       - preceded by a /name token: the named ToUnicode CMap
//                   file is looked up through globalParams and parsed
//                   recursively with the same nBits.
//   beginbfchar   - pairs "<src> <dst>" up to "endbfchar".
//   beginbfrange  - triples "<lo> <hi> X" up to "endbfrange", where X is
//                   either "[ <dst> ... ]" or a single "<dst>".
//
// A source code token is accepted when it has exactly nDigits hex digits,
// or nDigits + 2 hex digits whose first two are "00".
void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data,
252
char tok1[256], tok2[256], tok3[256];
253
int nDigits, n1, n2, n3;
255
CharCode code1, code2;
260
pst = new PSTokenizer(getCharFunc, data);
261
pst->getToken(tok1, sizeof(tok1), &n1);
262
while (pst->getToken(tok2, sizeof(tok2), &n2)) {
263
if (!strcmp(tok2, "usecmap")) {
264
// tok1 is the operand of usecmap; only a /name operand is followed.
if (tok1[0] == '/') {
265
name = new GooString(tok1 + 1);
266
if ((f = globalParams->findToUnicodeFile(name))) {
267
parseCMap1(&getCharFromFile, f, nBits);
270
error(-1, "Couldn't find ToUnicode CMap file for '%s'",
275
pst->getToken(tok1, sizeof(tok1), &n1);
276
} else if (!strcmp(tok2, "beginbfchar")) {
277
while (pst->getToken(tok1, sizeof(tok1), &n1)) {
278
if (!strcmp(tok1, "endbfchar")) {
281
// A source token must be followed by a destination token, not by EOF
// or the block terminator.
if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
282
!strcmp(tok2, "endbfchar")) {
283
error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
286
// Preferred form: <src> with exactly nDigits hex digits ...
if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
287
tok2[0] == '<' && tok2[n2 - 1] == '>')) {
288
// ... otherwise tolerate <src> with two extra leading "00" digits.
if (!(n1 == 4 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' && tok1[1] == '0' && tok1[2] == '0' &&
289
tok2[0] == '<' && tok2[n2 - 1] == '>')) {
290
error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
294
// Drop the closing '>' so the hex digits are NUL-terminated.
tok1[n1 - 1] = tok2[n2 - 1] = '\0';
295
if (sscanf(tok1 + 1, "%x", &code1) != 1) {
296
error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
299
addMapping(code1, tok2 + 1, n2 - 2, 0);
301
pst->getToken(tok1, sizeof(tok1), &n1);
302
} else if (!strcmp(tok2, "beginbfrange")) {
303
while (pst->getToken(tok1, sizeof(tok1), &n1)) {
304
if (!strcmp(tok1, "endbfrange")) {
307
// A range entry needs two more tokens (range end and destination).
if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
308
!strcmp(tok2, "endbfrange") ||
309
!pst->getToken(tok3, sizeof(tok3), &n3) ||
310
!strcmp(tok3, "endbfrange")) {
311
error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
314
// Both range bounds must be nDigits hex digits, or nDigits + 2 digits
// starting with "00".  The "00" test for the range end inspects tok2
// itself (it used to re-test tok1, a copy/paste slip).
if (!(((n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>') ||
315
(n1 == 4 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' && tok1[1] == '0' && tok1[2] == '0')) &&
316
((n2 == 2 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>') ||
317
(n2 == 4 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>' && tok2[1] == '0' && tok2[2] == '0')))) {
318
error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
321
// Drop the closing '>' so the hex digits are NUL-terminated.
tok1[n1 - 1] = tok2[n2 - 1] = '\0';
322
if (sscanf(tok1 + 1, "%x", &code1) != 1 ||
323
sscanf(tok2 + 1, "%x", &code2) != 1) {
324
error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
327
// Array form: one explicit destination per code in [code1, code2].
if (!strcmp(tok3, "[")) {
329
while (pst->getToken(tok1, sizeof(tok1), &n1) &&
330
code1 + i <= code2) {
331
if (!strcmp(tok1, "]")) {
334
if (tok1[0] == '<' && tok1[n1 - 1] == '>') {
336
addMapping(code1 + i, tok1 + 1, n1 - 2, 0);
338
error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
342
// Single-destination form: every code in the range maps to the
// destination with the running index i passed as addMapping's offset.
} else if (tok3[0] == '<' && tok3[n3 - 1] == '>') {
344
for (i = 0; code1 <= code2; ++code1, ++i) {
345
addMapping(code1, tok3 + 1, n3 - 2, i);
349
error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
352
pst->getToken(tok1, sizeof(tok1), &n1);
360
// Records a mapping for `code` from the hex digit string `uStr` of length
// `n`.  The visible code has two storage paths: a single value parsed from
// uStr and stored in map[code], and a multi-value entry appended to sMap.
// `offset` is added to the stored value (to the last value of a
// multi-value entry).
// NOTE(review): the condition choosing between the two paths is not
// visible in this listing.
void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
367
// Grow the direct map when `code` is beyond its current end.
if (code >= mapLen) {
369
// New length: the next multiple of 256 above `code`.
mapLen = (code + 256) & ~255;
370
// Still out of range after growing means the computation above wrapped.
if (unlikely(code >= mapLen)) {
371
error(-1, "Illegal code value in CharCodeToUnicode::addMapping");
374
map = (Unicode *)greallocn(map, mapLen, sizeof(Unicode));
375
for (i = oldLen; i < mapLen; ++i) {
381
if (sscanf(uStr, "%x", &u) != 1) {
382
error(-1, "Illegal entry in ToUnicode CMap");
385
map[code] = u + offset;
387
// The string map grows 16 entries at a time.
if (sMapLen >= sMapSize) {
388
sMapSize = sMapSize + 16;
389
sMap = (CharCodeToUnicodeString *)
390
greallocn(sMap, sMapSize, sizeof(CharCodeToUnicodeString));
393
sMap[sMapLen].c = code;
394
// Four hex digits of uStr per Unicode value.
sMap[sMapLen].len = n / 4;
395
sMap[sMapLen].u = (Unicode*)gmallocn(sMap[sMapLen].len, sizeof(Unicode));
396
for (j = 0; j < sMap[sMapLen].len; ++j) {
397
// Copy the j-th group of four hex digits into uHex for parsing.
strncpy(uHex, uStr + j*4, 4);
399
if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
400
error(-1, "Illegal entry in ToUnicode CMap");
403
// The offset applies to the final value of the sequence only.
sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset;
408
// Constructs an object with a freshly allocated direct map of mapLen
// entries (initialised by the loop below) and an empty string map.
CharCodeToUnicode::CharCodeToUnicode(GooString *tagA) {
413
map = (Unicode *)gmallocn(mapLen, sizeof(Unicode));
414
for (i = 0; i < mapLen; ++i) {
418
sMapLen = sMapSize = 0;
425
// Constructs an object from a prepared direct map (mapA, mapLenA entries)
// and string map (sMapA with sMapLenA used / sMapSizeA allocated entries).
// The visible code allocates a private map and memcpy's mapA into it.
// NOTE(review): how `copyMap` gates that copy is not visible in this
// listing.
CharCodeToUnicode::CharCodeToUnicode(GooString *tagA, Unicode *mapA,
426
CharCode mapLenA, GBool copyMap,
427
CharCodeToUnicodeString *sMapA,
428
int sMapLenA, int sMapSizeA) {
432
map = (Unicode *)gmallocn(mapLen, sizeof(Unicode));
433
memcpy(map, mapA, mapLen * sizeof(Unicode));
439
sMapSize = sMapSizeA;
446
// Destructor: frees the Unicode array owned by each string-map entry and
// destroys the object's mutex.
CharCodeToUnicode::~CharCodeToUnicode() {
452
for (int i = 0; i < sMapLen; ++i) gfree(sMap[i].u);
456
gDestroyMutex(&mutex);
460
// Presumably increments refCnt while holding `mutex`; only the unlock is
// visible in this listing -- TODO confirm.
void CharCodeToUnicode::incRefCnt() {
466
gUnlockMutex(&mutex);
470
// Decrements refCnt and records in `done` whether it reached zero, then
// releases `mutex`.
// NOTE(review): what is done with `done` afterwards is not visible here.
void CharCodeToUnicode::decRefCnt() {
476
done = --refCnt == 0;
478
gUnlockMutex(&mutex);
485
// Returns true when this object has a tag and tag->cmp(tagA) returns zero;
// an untagged object never matches.
GBool CharCodeToUnicode::match(GooString *tagA) {
486
return tag && !tag->cmp(tagA);
489
// Sets the mapping for char code `c` to the `len` Unicode values in `u`.
// The visible code searches the string map for an existing entry for `c`,
// grows the string map when it is full, and allocates a `len`-element
// array for the chosen entry before filling it in the final loop.
void CharCodeToUnicode::setMapping(CharCode c, Unicode *u, int len) {
495
// Look for an entry that already belongs to c.
for (i = 0; i < sMapLen; ++i) {
496
if (sMap[i].c == c) {
502
if (sMapLen == sMapSize) {
504
sMap = (CharCodeToUnicodeString *)
505
greallocn(sMap, sMapSize, sizeof(CharCodeToUnicodeString));
512
sMap[i].u = (Unicode*)gmallocn(len, sizeof(Unicode));
513
for (j = 0; j < len; ++j) {
519
// Looks up the Unicode value(s) for char code `c`.  The visible code scans
// the string map from its last entry to its first for an entry whose code
// equals `c`.
// NOTE(review): the direct-map lookup and the values written to *u and
// returned are not visible in this listing.
int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode **u) {
529
for (i = sMapLen - 1; i >= 0; --i) { // in reverse so CMap takes precedence
530
if (sMap[i].c == c) {
538
// Reverse lookup: searches for a char code whose mapping equals the
// `usize` Unicode values in `u`.  The direct map is scanned first, then
// the string map is searched for an entry of the same length with
// identical contents.
// NOTE(review): the assignment to *c and the return values are not
// visible in this listing.
int CharCodeToUnicode::mapToCharCode(Unicode* u, CharCode *c, int usize) {
539
//look for charcode in map
541
for (CharCode i=0; i<mapLen; i++) {
550
//for each entry in the sMap
551
for (i=0; i<sMapLen; i++) {
552
//if the entry's unicode length isn't the same as usize, the strings
553
// are obviously different
554
if (sMap[i].len != usize) continue;
555
//compare the string char by char
556
for (j=0; j<sMap[i].len; j++) {
557
if (sMap[i].u[j] != u[j]) {
562
//we have the same strings
563
if (j==sMap[i].len) {
572
//------------------------------------------------------------------------
574
// Allocates the cache's array of `size` CharCodeToUnicode pointers; the
// loop below initialises each slot.
CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA) {
578
cache = (CharCodeToUnicode **)gmallocn(size, sizeof(CharCodeToUnicode *));
579
for (i = 0; i < size; ++i) {
584
// Destructor: walks the cache slots and drops the cache's reference on
// entries via decRefCnt().
CharCodeToUnicodeCache::~CharCodeToUnicodeCache() {
587
for (i = 0; i < size; ++i) {
589
cache[i]->decRefCnt();
595
// Looks up a cached CharCodeToUnicode whose tag matches `tag`.  Slot 0 is
// checked first and gets an extra reference on a hit.  On a hit in a later
// slot i, the entries in slots 0..i-1 are shifted one position towards
// the back.
// NOTE(review): the re-insertion at slot 0 and the return statements are
// not visible in this listing.
CharCodeToUnicode *CharCodeToUnicodeCache::getCharCodeToUnicode(GooString *tag) {
596
CharCodeToUnicode *ctu;
599
if (cache[0] && cache[0]->match(tag)) {
600
cache[0]->incRefCnt();
603
for (i = 1; i < size; ++i) {
604
if (cache[i] && cache[i]->match(tag)) {
606
// Shift slots 0..i-1 down by one, overwriting slot i.
for (j = i; j >= 1; --j) {
607
cache[j] = cache[j - 1];
617
// Makes room for `ctu` in the cache: the entry in the last slot, if any,
// has its reference dropped, and every remaining entry is shifted one
// slot towards the back.
// NOTE(review): the store of `ctu` into slot 0 is not visible in this
// listing.
void CharCodeToUnicodeCache::add(CharCodeToUnicode *ctu) {
620
if (cache[size - 1]) {
621
cache[size - 1]->decRefCnt();
623
for (i = size - 1; i >= 1; --i) {
624
cache[i] = cache[i - 1];