~ubuntu-branches/debian/sid/httrack/sid

« back to all changes in this revision

Viewing changes to src/htscharset.c

Committer: Package Import Robot
Author(s): Xavier Roche
Date: 2014-04-09 21:02:08 UTC
mfrom: (1.4.46)
Revision ID: package-import@ubuntu.com-20140409210208-n2o92f7nj5g82p7k

Tags: 3.48.1-1

Updated to 3.48.1 (3.48-1)

files added:
compile

lang/Croatian.txt

libtest/example-main.c

libtest/example-main.h

src/htsbasiccharsets.sh

src/htscodepages.h

src/murmurhash3.h

tests/01_engine-hashtable.test

files removed:
libtest/example.c

libtest/example.h

src/htsmms.c

src/htsmms.h

src/mmsrip

src/mmsrip/AUTHORS

src/mmsrip/COPYING

src/mmsrip/ChangeLog

src/mmsrip/NEWS

src/mmsrip/README

src/mmsrip/common.h

src/mmsrip/error.c

src/mmsrip/error.h

src/mmsrip/main.c

src/mmsrip/mms.c

src/mmsrip/mms.h

files modified:
Makefile.in

config.h.in

configure

configure.ac

debian/changelog

debian/libhttrack2.files

history.txt

lang.def

lang.indexes

lang/Cesky.txt

lang/Chinese-BIG5.txt

lang/Chinese-Simplified.txt

lang/Eesti.txt

lang/English.txt

lang/Finnish.txt

lang/Francais.txt

lang/Japanese.txt

lang/Portugues-Brasil.txt

lang/Ukrainian.txt

libtest/readme.txt

m4/check_zlib.m4

src/Makefile.am

src/Makefile.in

src/htsalias.c

src/htsback.c

src/htsback.h

src/htsbase.h

src/htsbasenet.h

src/htsbauth.c

src/htscatchurl.c

src/htscharset.c

src/htscharset.h

src/htsconfig.h

src/htscore.c

src/htscore.h

src/htscoremain.c

src/htsftp.c

src/htsglobal.h

src/htshash.c

src/htshash.h

src/htshelp.c

src/htsindex.c

src/htsinthash.c

src/htsinthash.h

src/htsjava.c

src/htslib.c

src/htslib.h

src/htsmodules.c

src/htsname.c

src/htsnet.h

src/htsopt.h

src/htsparse.c

src/htsserver.c

src/htsserver.h

src/htsthread.h

src/htstools.c

src/htsweb.c

src/htswizard.c

src/htszlib.c

src/httrack-library.h

src/httrack.c

src/md5.h

src/proxy/proxytrack.h

src/proxy/store.c

tests/Makefile.am

tests/Makefile.in

Show diffs side-by-side

added added

removed removed

src/htscharset.c

return 1;

}

#define IS_ALNUM(C) ( ((C) >= 'A' && (C) <= 'Z') || ((C) >= 'a' && (C) <= 'z') || ((C) >= '0' && (C) <= '9') )

#define CHAR_LOWER(C) ( ((C) >= 'A' && (C) <= 'Z') ? ((C) + 'a' - 'A') : (C) )

/* Compare two \0-terminated strings, looking only at ASCII letters and
   digits and ignoring ASCII case (so that "ISO88591" == "iso-8859-1").
   Any other character is skipped on both sides.
   Returns 1 if the strings are equivalent, 0 otherwise. */
static int hts_equalsAlphanum(const char *a, const char *b) {
  size_t i, j;

  for(i = 0, j = 0;; i++, j++) {
    /* Skip non-alnum (stops on the terminating \0 at the latest) */
    for(; a[i] != '\0' && !IS_ALNUM(a[i]); i++) ;
    for(; b[j] != '\0' && !IS_ALNUM(b[j]); j++) ;

    /* Compare */
    if (CHAR_LOWER(a[i]) != CHAR_LOWER(b[j])) {
      break;
    }
    /* End of string ? (note: a[i] == b[j], so both strings ended) */
    else if (a[i] == '\0') {
      return 1;
    }
  }

  /* Mismatch found */
  return 0;
}

#undef IS_ALNUM

#undef CHAR_LOWER

/* Duplicate the `size` bytes starting at `s` into a freshly malloc'ed
   buffer of size + 1 bytes and terminate it with \0.
   Returns the new buffer, or NULL if the allocation failed. */
static char *hts_stringMemCopy(const char *s, size_t size) {
  char *const copy = malloc(size + 1);

  /* Allocation failure: nothing to copy into */
  if (copy == NULL) {
    return NULL;
  }

  memcpy(copy, s, size);
  copy[size] = '\0';
  return copy;
}

#ifdef _WIN32

typedef struct wincodepage_t wincodepage_t;

207

241

UINT hts_getCodepage(const char *name) {

208

242

int id;

209

243

210

#define IS_ALNUM(C) ( ((C) >= 'A' && (C) <= 'Z') || ((C) >= 'a' && (C) <= 'z') || ((C) >= '0' && (C) <= '9') )

211

#define CHAR_LOWER(C) ( ((C) >= 'A' && (C) <= 'Z') ? ((C) + 'a' - 'A') : (C) )

212

244

for(id = 0; codepages[id].name != NULL; id++) {

213

int i, j;

214

215

245

/* Compare the two strings, lowercase and alphanum only (ISO88591 == iso-8859-1) */

216

const char *a = name, *b = codepages[id].name;

217

218

for(i = 0, j = 0;; i++, j++) {

219

/* Skip non-alnum */

220

for(; a[i] != '\0' && !IS_ALNUM(a[i]); i++) ;

221

for(; b[j] != '\0' && !IS_ALNUM(b[j]); j++) ;

222

/* Compare */

223

if (CHAR_LOWER(a[i]) != CHAR_LOWER(b[j])) {

224

break;

225

}

226

/* End of string ? (note: a[i] == b[j]) */

227

else if (a[i] == '\0') {

228

return codepages[id].codepage;

229

}

246

if (hts_equalsAlphanum(name, codepages[id].name)) {

247

return codepages[id].codepage;

230

248

}

231

249

}

232

#undef IS_ALNUM

233

#undef CHAR_LOWER

250

234

251

/* Not found */

235

252

return 0;

236

253

}

237

254

238

static char *strndup(const char *s, size_t size) {

239

char *dest = malloc(size + 1);

240

241

if (dest != NULL) {

242

memcpy(dest, s, size);

243

dest[size] = '\0';

244

return dest;

245

}

246

return NULL;

247

}

248

249

255

LPWSTR hts_convertStringToUCS2(const char *s, int size, UINT cp, int *pwsize) {

250

256

/* Size in wide chars of the output */

251

257

const int wsize = MultiByteToWideChar(cp, 0, (LPCSTR) s, size, NULL, 0);

302

308

char *hts_convertStringCPToUTF8(const char *s, size_t size, UINT cp) {

303

309

/* Empty string ? */

304

310

if (size == 0) {

305

return strndup(s, size);

311

return hts_stringMemCopy(s, size);

306

312

}

307

313

/* Already UTF-8 ? */

308

314

if (cp == CP_UTF8 || hts_isStringAscii(s, size)) {

309

return strndup(s, size);

315

return hts_stringMemCopy(s, size);

310

316

}

311

317

/* Other (valid) charset */

312

318

else if (cp != 0) {

329

335

char *hts_convertStringCPFromUTF8(const char *s, size_t size, UINT cp) {

330

336

/* Empty string ? */

331

337

if (size == 0) {

332

return strndup(s, size);

338

return hts_stringMemCopy(s, size);

333

339

}

334

340

/* Already UTF-8 ? */

335

341

if (cp == CP_UTF8 || hts_isStringAscii(s, size)) {

336

return strndup(s, size);

342

return hts_stringMemCopy(s, size);

337

343

}

338

344

/* Other (valid) charset */

339

345

else if (cp != 0) {

372

378

#else

373

379

374

380

#include <errno.h>

381

382

#if ( defined(HTS_USEICONV) && ( HTS_USEICONV == 0 ) )

383

#define DISABLE_ICONV

384

#endif

385

386

#ifndef DISABLE_ICONV

375

387

#include <iconv.h>

376

377

static char *hts_convertStringToUTF8_(const char *s, size_t size,

388

#else

389

#include "htscodepages.h"

390

391

/* decode from a codepage to UTF-8 */

392

static char* hts_codepageToUTF8(const char *codepage, const char *s) {

393

/* find the given codepage */

394

size_t i;

395

for(i = 0 ; table_mappings[i].name != NULL

396

&& !hts_equalsAlphanum(table_mappings[i].name, codepage) ; i++) ;

397

398

/* found ; decode */

399

if (table_mappings[i].name != NULL) {

400

size_t j, k;

401

char *dest = NULL;

402

size_t capa = 0;

403

#define MAX_UTF 8

404

for(j = 0, k = 0 ; s[j] != '\0' ; j++) {

405

const unsigned char c = (unsigned char) s[j];

406

const hts_UCS4 uc = table_mappings[i].table[c];

407

const size_t max = k + MAX_UTF;

408

if (capa < max) {

409

for(capa = 16 ; capa < max ; capa <<= 1) ;

410

dest = realloc(dest, capa);

411

if (dest == NULL) {

412

return NULL;

413

}

414

}

415

if (dest != NULL) {

416

const size_t len = hts_writeUTF8(uc, &dest[k], MAX_UTF);

417

k += len;

418

assert(k < capa);

419

}

420

}

421

dest[k] = '\0';

422

return dest;

423

#undef MAX_UTF

424

}

425

return NULL;

426

}

427

#endif

428

429

static char *hts_convertStringCharset(const char *s, size_t size,

378

430

const char *to, const char *from) {

379

431

/* Empty string ? */

380

432

if (size == 0) {

381

433

return strdup("");

382

434

}

383

435

/* Already on correct charset ? */

384

if (strcasecmp(from, to) == 0) {

385

return strndup(s, size);

436

if (hts_equalsAlphanum(from, to)) {

437

return hts_stringMemCopy(s, size);

386

438

}

439

#ifndef DISABLE_ICONV

387

440

/* Find codepage */

388

441

else {

389

442

const iconv_t cp = iconv_open(to, from);

442

495

return outbuf;

443

496

}

444

497

}

498

#else

499

/* Limited codepage decoding support only. */

500

if (hts_isCharsetUTF8(to)) {

501

return hts_codepageToUTF8(from, s);

502

}

503

#endif

445

504

446

505

/* Error, charset not found! */

447

506

return NULL;

454

513

}

455

514

/* Already UTF-8 ? */

456

515

if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {

457

return strndup(s, size);

516

return hts_stringMemCopy(s, size);

458

517

}

459

518

/* Find codepage */

460

519

else {

461

return hts_convertStringToUTF8_(s, size, "utf-8", charset);

520

return hts_convertStringCharset(s, size, "utf-8", charset);

462

521

}

463

522

}

464

523

469

528

}

470

529

/* Already UTF-8 ? */

471

530

if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {

472

return strndup(s, size);

531

return hts_stringMemCopy(s, size);

473

532

}

474

533

/* Find codepage */

475

534

else {

476

return hts_convertStringToUTF8_(s, size, charset, "utf-8");

535

return hts_convertStringCharset(s, size, charset, "utf-8");

477

536

}

478

537

}

479

538

1133

1192

return dest;

1134

1193

}

1135

1194

1195

/* Scan the `size` bytes starting at `s` with READ_UNICODE.
   Returns 0 as soon as the WR callback below is handed -1, and 1 once all
   `size` bytes have been consumed without that happening.
   NOTE(review): presumably READ_UNICODE passes -1 to WR for a malformed or
   truncated UTF-8 sequence -- confirm against the macro definition. */
int hts_isStringUTF8(const char *s, size_t size) {

1196

const unsigned char *const data = (const unsigned char*) s;

1197

size_t i;

1198

1199

for(i = 0 ; i < size ; ) {

1200

/* Reader: yields the next byte and advances i, or -1 once `size` bytes have been read */

1201

#define RD ( i < size ? data[i++] : -1 )

1202

1203

/* Writer: nothing is stored; a value of -1 makes the function return 0 */

1204

#define WR(C) if ((C) == -1) { return 0; }

1205

1206

/* Read Unicode character. */

1207

READ_UNICODE(RD, WR);

1208

#undef RD

1209

#undef WR

1210

}

1211

1212

return 1;

1213

}

1214

1136

1215

char *hts_convertUCS4StringToUTF8(const hts_UCS4 *s, size_t nChars) {

1137

1216

size_t i;

1138

1217

char *dest = NULL;

Older »