~ubuntu-branches/ubuntu/hardy/sigscheme/hardy-proposed

Viewing changes to encoding.c

Committer: Bazaar Package Importer
Author(s): NIIBE Yutaka
Date: 2006-05-23 21:46:41 UTC
mfrom: (1.1.1 upstream)
Revision ID: james.westby@ubuntu.com-20060523214641-6ix4gz34wpiehub8

Tags: 0.5.0-2

http://bugs.debian.org/368571

* debian/control (Build-Depends): Added ruby.
Thanks to Frederik Schueler. Closes: #368571
* debian/rules (clean): invoke 'distclean' instead of 'clean'.

files added:
MEMO.tkng

Makefile.am

Makefile.in

NEWS

README

TODO

aclocal.m4

autogen.sh

autom4te.cache

autom4te.cache/output.0

autom4te.cache/output.1

autom4te.cache/requests

autom4te.cache/traces.0

autom4te.cache/traces.1

bench

bench/bench-arithint.scm

bench/bench-case.scm

bench/bench-cpstak.scm

bench/bench-fib.scm

bench/bench-let-loop.scm

bench/bench-loop.scm

bench/bench-mem.scm

bench/bench-rec.scm

bench/bench-tak.scm

bench/bench-takl.scm

bench/bench-takr.scm

compare-scm.sh

compile

config.guess

config.h.in

config.sub

configure

configure.in

depcomp

doc/design.txt

doc/spec.txt

doc/style.txt

experimental

experimental/imm-const

experimental/imm-const/Makefile

experimental/imm-const/imm-test-gcc32-Os-armv5te-thumb.txt

experimental/imm-const/imm-test-gcc32-Os-armv5te.txt

experimental/imm-const/imm-test-gcc32-Os-m68k.txt

experimental/imm-const/imm-test-gcc32-Os-mips.txt

experimental/imm-const/imm-test-gcc32-Os-mips16.txt

experimental/imm-const/imm-test-gcc32-Os-sh2.txt

experimental/imm-const/imm-test-gcc32-Os-sh3.txt

experimental/imm-const/imm-test-gcc32-Os-sh4.txt

experimental/imm-const/imm-test-gcc34-Os-avr.txt

experimental/imm-const/imm-test-gcc34-Os-i486.txt

experimental/imm-const/imm-test-gcc34-Os-ppro-i486.txt

experimental/imm-const/imm-test-gcc34-Os-ppro.txt

experimental/imm-const/imm-test.c

install-sh

ltmain.sh

m4/Makefile.am

m4/Makefile.in

m4/ax_c___attribute__.m4

m4/ax_check_page_aligned_malloc.m4

m4/ax_create_stdint_h.m4

m4/ax_lib_glibc.m4

misc

misc/scm-obj-compact-gdbinit

missing

runbench.sh

runtest-tail-rec.sh

runtest.sh

slib.scm

src/Makefile.am

src/Makefile.in

src/alloc.c

src/basecport.c

src/baseport.h

src/c_template

src/char.c

src/config-asprintf.h

src/config-nonstd-string.h

src/config.h

src/deep-cadrs.c

src/encoding.c

src/encoding.h

src/env.c

src/error.c

src/eval.c

src/fileport.c

src/fileport.h

src/functable-r5rs-procedure.c

src/functable-r5rs-syntax.c

src/functable-siod.c

src/functable-srfi1.c

src/functable-srfi2.c

src/functable-srfi23.c

src/functable-srfi34.c

src/functable-srfi38.c

src/functable-srfi6.c

src/functable-srfi60.c

src/functable-srfi8.c

src/functable-sscm-core.c

src/functable-sscm-ext.c

src/h_template

src/list.c

src/load.c

src/main.c

src/mbcport.c

src/mbcport.h

src/module-siod.c

src/module-srfi1.c

src/module-srfi2.c

src/module-srfi23.c

src/module-srfi34.c

src/module-srfi38.c

src/module-srfi6.c

src/module-srfi60.c

src/module-srfi8.c

src/module-sscm-ext.c

src/module.c

src/my-stdint.h

src/nullport.c

src/nullport.h

src/number.c

src/port.c

src/procedure.c

src/read.c

src/sbcport.c

src/sbcport.h

src/scm_functype_mand_max15.diff

src/script

src/script/build_func_table.rb

src/script/check_declare_func_typo.rb

src/script/check_initialize_scm_null.rb

src/script/functable-footer.txt

src/script/functable-header.txt

src/script/scm_decl.rb

src/sigscheme.c

src/sigscheme.h

src/sigschemeinternal.h

src/storage-compact.h

src/storage-continuation.c

src/storage-fatty.h

src/storage-gc.c

src/storage-protection.c

src/storage-symbol.c

src/storage.c

src/string.c

src/strport.c

src/strport.h

src/syntax.c

src/test-compact.c

src/vector.c

src/write.c

test

test/bigloo-apply.scm

test/bigloo-bchar.scm

test/bigloo-bool.scm

test/bigloo-case.scm

test/bigloo-letrec.scm

test/bigloo-list.scm

test/bigloo-quote.scm

test/bigloo-vector.scm

test/define.scm

test/for-each.scm

test/gauche-primsyn.scm

test/io.scm

test/test-apply.scm

test/test-char-cmp.scm

test/test-char.scm

test/test-continuation.scm

test/test-define.scm

test/test-delay-force.scm

test/test-enc-eucgeneric.scm

test/test-enc-eucjp.scm

test/test-enc-sjis.scm

test/test-enc-utf8.scm

test/test-equation.scm

test/test-eval.scm

test/test-exp.scm

test/test-list.scm

test/test-map.scm

test/test-num.scm

test/test-quote.scm

test/test-r4rs.scm

test/test-srfi1.scm

test/test-srfi2.scm

test/test-srfi34-2.scm

test/test-srfi34.scm

test/test-srfi38.scm

test/test-srfi6.scm

test/test-srfi60.scm

test/test-srfi8.scm

test/test-string-cmp.scm

test/test-string.scm

test/test-syntax.scm

test/test-tail-rec.scm

test/test-vector.scm

test/unittest-bigloo.scm

test/unittest-gauche.scm

test/unittest.scm

files removed:
.bzr-builddeb

.bzr-builddeb/default.conf

COPYING

Makefile

baseport.h

config.h

datas.c

debug.c

encoding.c

error.c

eval.c

fileport.c

fileport.h

io.c

main.c

operations-siod.c

operations-srfi1.c

operations-srfi2.c

operations-srfi23.c

operations-srfi34.c

operations-srfi38.c

operations-srfi6.c

operations-srfi60.c

operations-srfi8.c

operations.c

read.c

sbcport.c

sbcport.h

sigscheme.1

sigscheme.c

sigscheme.h

sigschemeinternal.h

sigschemetype-compact.h

sigschemetype.h

storage-protection.c

test-compact.c

files modified:
AUTHORS

debian/changelog

debian/control

debian/rules

debian/watch

Show diffs side-by-side

added added

removed removed

encoding.c

/*===========================================================================

* FileName : encoding.c

* About : handling encoding

* Redistribution and use in source and binary forms, with or without

* modification, are permitted provided that the following conditions

* are met:

* 1. Redistributions of source code must retain the above copyright

* notice, this list of conditions and the following disclaimer.

* 2. Redistributions in binary form must reproduce the above copyright

* notice, this list of conditions and the following disclaimer in the

* documentation and/or other materials provided with the distribution.

* 3. Neither the name of authors nor the names of its contributors

* may be used to endorse or promote products derived from this software

* without specific prior written permission.

* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''

* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE

* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE

* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)

* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY

* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

* SUCH DAMAGE.

===========================================================================*/

/* Acknowledgement: much information was gained from the

* i18n-introduction of the debian project. Many thanks to its

* authors, Tomohiro KUBOTA, et al. */

/*=======================================

System Include

=======================================*/

/*=======================================

Local Include

=======================================*/

#include "sigscheme.h"

#include "sigschemeinternal.h"

/*=======================================

File Local Functions

=======================================*/

#if SCM_USE_EUCJP

static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs);

#endif

#if SCM_USE_ISO2022KR

static ScmMultibyteCharInfo iso2022kr_scan_char(ScmMultibyteString mbs);

static ScmMultibyteCharInfo iso2022kr_scan_input_char(ScmMultibyteString mbs);

#endif

#if SCM_USE_ISO2022JP

static ScmMultibyteCharInfo iso2022jp_scan_char(ScmMultibyteString mbs);

static ScmMultibyteCharInfo iso2022jp_scan_input_char(ScmMultibyteString mbs);

#endif

#if SCM_USE_SJIS

static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs);

#endif

#if SCM_USE_EUCCN

static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs);

#endif

#if SCM_USE_EUCKR

static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs);

#endif

#if SCM_USE_UTF8

static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs);

#endif

static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs);

typedef unsigned char uchar;

/*=======================================

Global Variables

=======================================*/

/* TODO: add some mechanism to dynamically switch between encodings. */

/* Scanner used by the public API below to measure the first character of a
 * multibyte string.  UTF-8 is the default when that codec is compiled in;
 * otherwise the always-available single-byte scanner is used, so this
 * initializer never names a function that was compiled out. */
ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs)
#if SCM_USE_UTF8
    = utf8_scan_char;
#else
    = unibyte_scan_char;
#endif

/*=======================================

Public API

=======================================*/

/* Returns the number of characters in mbs.
 *
 * The string is consumed one character at a time: the current scanner
 * (Scm_mb_scan_char) measures the leading character, SCM_MBS_SKIP_CHAR()
 * steps over it, and the loop ends once no octets remain.  mbs is passed
 * by value, so the caller's copy is untouched. */
int Scm_mb_strlen(ScmMultibyteString mbs)
{
    int count;
    ScmMultibyteCharInfo info;

    CDBG((SCM_DBG_ENCODING, "mb_strlen: size = %d; str = %s;",
          SCM_MBS_GET_SIZE(mbs), SCM_MBS_GET_STR(mbs)));

    for (count = 0; SCM_MBS_GET_SIZE(mbs); count++) {
        info = Scm_mb_scan_char(mbs);
        CDBG((SCM_DBG_ENCODING, "%d, %d;",
              SCM_MBCINFO_GET_SIZE(info), info.flag));
        SCM_MBS_SKIP_CHAR(mbs, info);
    }

    CDBG((SCM_DBG_ENCODING, "len=%d\n", count));
    return count;
}

116

117

/* FIXME: pick a better name. */

118

int Scm_mb_bare_c_strlen(const char *s)

119

{

120

ScmMultibyteString mbs;

121

SCM_MBS_INIT(mbs);

122

SCM_MBS_SET_STR(mbs, s);

123

SCM_MBS_SET_SIZE(mbs, strlen(s));

124

return Scm_mb_strlen(mbs);

125

}

126

127

/* Returns the substring of mbs that starts i characters in and spans len
 * characters.
 *
 * The result is a copy of mbs advanced past the first i characters, with
 * its size set to the octet distance covered by the next len characters.
 * No octets are copied.
 *
 * i and len are counted in characters as measured by Scm_mb_scan_char.
 * Non-positive values skip nothing; previously a negative count kept
 * decrementing until the int overflowed.
 *
 * NOTE(review): no check is made that i + len fits inside mbs.  The
 * scanners in this file report size 0 for an exhausted string, so
 * overshooting presumably just stops advancing -- confirm against
 * SCM_MBS_SKIP_CHAR(). */
ScmMultibyteString Scm_mb_substring(ScmMultibyteString mbs, int i, int len)
{
    ScmMultibyteString ret;
    ScmMultibyteString end;
    ScmMultibyteCharInfo c;

    /* Advance to the first character of the substring. */
    ret = mbs;
    while (i-- > 0) {
        c = Scm_mb_scan_char(ret);
        SCM_MBS_SKIP_CHAR(ret, c);
    }

    /* Walk a second cursor over the requested characters. */
    end = ret;
    while (len-- > 0) {
        c = Scm_mb_scan_char(end);
        SCM_MBS_SKIP_CHAR(end, c);
    }

    /* The octets between the two cursors are the substring. */
    SCM_MBS_SET_SIZE(ret, SCM_MBS_GET_STR(end) - SCM_MBS_GET_STR(ret));
    return ret;
}

150

151

152

/*=======================================

153

Encoding-specific functions

154

=======================================*/

155

156

/* Each encoding provides an <encoding name>_scan_char() primitive whose
 * job is to determine the length of the first character in the given
 * string.  Stateful encodings should save their state *at exit*, i.e. the
 * state right after reading that first character (so don't omit it). */

/* Helper macros for writing scanners.  Open the function body with ENTER
 * and leave it through one of the RETURN* macros.
 *
 *   ENTER                 declares and initializes the result, _ret
 *   RETURN(n)             returns _ret with its size set to n
 *   RETURN_ERROR()        flags _ret as an error and returns size 1
 *   RETURN_INCOMPLETE(n)  flags _ret as incomplete and returns size n
 *   SAVE_STATE(stat)      stores stat in _ret
 *   EXPECT_SIZE(size)     declares the expected character length; meant
 *                         for reporting how many octets are missing, and
 *                         doubles as documentation
 */
#define ENTER                                                               \
    ScmMultibyteCharInfo _ret;                                              \
    SCM_MBCINFO_INIT(_ret)

#define RETURN(n)                                                           \
    do {                                                                    \
        SCM_MBCINFO_SET_SIZE(_ret, n);                                      \
        return _ret;                                                        \
    } while (0)

#define RETURN_ERROR()                                                      \
    do {                                                                    \
        SCM_MBCINFO_SET_ERROR(_ret);                                        \
        RETURN(1);                                                          \
    } while (0)

#define RETURN_INCOMPLETE(n)                                                \
    do {                                                                    \
        SCM_MBCINFO_SET_INCOMPLETE(_ret);                                   \
        RETURN(n);                                                          \
    } while (0)

#define SAVE_STATE(stat) (SCM_MBCINFO_SET_STATE(_ret, (stat)))

#define EXPECT_SIZE(size) /* Currently ignored. */

172

173

/* Encodings based on ISO/IEC 2022. */

/* Every predicate converts its argument to uchar first, so it gives the
 * same answer whether plain char is signed or unsigned. */

/* Control regions: CL is 0x00..0x1F, CR is 0x80..0x9F. */
#define IN_CL(c)   ((uchar)(c) <= 0x1F)
#define IN_CR(c)   ((uchar)(c) >= 0x80 && (uchar)(c) <= 0x9F)

/* General purpose regions, in their 94- and 96-position forms. */
#define IN_GL94(c) ((uchar)(c) >= 0x21 && (uchar)(c) <= 0x7E)
#define IN_GL96(c) ((uchar)(c) >= 0x20 && (uchar)(c) <= 0x7F)
#define IN_GR94(c) ((uchar)(c) >= 0xA1 && (uchar)(c) <= 0xFE)
#define IN_GR96(c) ((uchar)(c) >= 0xA0 && (uchar)(c) <= 0xFF)

/* Any octet with the high bit clear. */
#define IS_ASCII(c) ((uchar)(c) < 0x80)
/* The two GR96 positions that fall outside GR94. */
#define IS_GR_SPC_OR_DEL(c) ((uchar)(c) == 0xFF || (uchar)(c) == 0xA0)

/* Control codes: escape, shift-out, shift-in, single-shift 2 and 3. */
#define ESC 0x1B
#define SO  0x0E
#define SI  0x0F
#define SS2 0x8E
#define SS3 0x8F

193

194

195

#if SCM_USE_EUCJP
/* EUC-JP scanner.
 *
 * Character set designations:
 *   G0 <- (96) ASCII (or was it JIS X 0201 Roman?)
 *   G1 <- (94x94) JIS X 0208 kanji/kana
 *   G2 <- (94) JIS X 0201 Katakana ("half-width katakana")
 *   G3 <- (94x94) JIS X 0212 kanji, or JIS X 0213 kanji plane 2
 *
 *   GL <- G0 (ASCII)
 *   GR <- G1 (JIS X 0208)
 *   CL <- JIS X 0211 C0
 *   CR <- JIS X 0211 C1
 *
 * Lengths reported for the first character of mbs:
 *   0  empty input
 *   1  octet in CL or GL
 *   2  GR94 lead octet, or SS2 followed by one octet
 *   3  SS3 followed by two octets
 * Truncated sequences are returned as incomplete.  Any other lead octet
 * is an error.  Octets after the lead are validated only when
 * SCM_STRICT_ENCODING_CHECK is enabled. */
static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs)
{
    const char *p = SCM_MBS_GET_STR(mbs);
    const int avail = SCM_MBS_GET_SIZE(mbs);
    ENTER;

    if (avail == 0)
        RETURN(0);

    /* Single-octet characters. */
    if (IN_CL(p[0]) || IN_GL96(p[0]))
        RETURN(1);

    /* G1 (two GR octets) or G2 (SS2 + one octet). */
    if ((uchar)p[0] == SS2 || IN_GR94(p[0])) {
        EXPECT_SIZE(2);
        if (avail < 2)
            RETURN_INCOMPLETE(1);
#if SCM_STRICT_ENCODING_CHECK
        if (!IN_GR96(p[1]))
            RETURN_ERROR();
#endif
        RETURN(2);
    }

    /* G3 (SS3 + two octets). */
    if ((uchar)p[0] == SS3) {
        EXPECT_SIZE(3);
#if SCM_STRICT_ENCODING_CHECK
        if (avail < 2)
            RETURN_INCOMPLETE(avail);
        /* SS3 followed by GR space/delete is reported as 2 octets. */
        if (IS_GR_SPC_OR_DEL(p[1]))
            RETURN(2);
        if (!IN_GR94(p[1]))
            RETURN_ERROR();
        if (avail < 3)
            RETURN_INCOMPLETE(avail);
        if (!IN_GR94(p[2]))
            RETURN_ERROR();
        RETURN(3);
#else /* not SCM_STRICT_ENCODING_CHECK */
        if (avail < 3)
            RETURN_INCOMPLETE(avail);
        RETURN(3);
#endif /* not SCM_STRICT_ENCODING_CHECK */
    }

    RETURN_ERROR();
}
#endif /* SCM_USE_EUCJP */

243

244

#if SCM_USE_EUCCN
/* EUC-CN scanner.  FIXME: NOT TESTED!
 *
 *   G0 <- ASCII (or GB 1988?)
 *   G1 <- GB2312
 *
 *   GL <- G0 (ASCII)
 *   GR <- G1 (GB2312)
 *
 * Reports 0 for empty input, 1 for an ASCII octet and 2 for a GR94 lead
 * octet with a following octet.  A lone lead octet is incomplete, and any
 * other lead octet is an error.  The second octet is validated only when
 * SCM_STRICT_ENCODING_CHECK is enabled. */
static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs)
{
    /* TODO: maybe we can make this an alias of eucjp_scan_char()? */
    const char *p = SCM_MBS_GET_STR(mbs);
    const int avail = SCM_MBS_GET_SIZE(mbs);
    ENTER;

    if (avail == 0)
        RETURN(0);

    if (IS_ASCII(p[0]))
        RETURN(1);

    if (!IN_GR94(p[0]))
        RETURN_ERROR();

    EXPECT_SIZE(2);
    if (avail < 2)
        RETURN_INCOMPLETE(avail);
#if SCM_STRICT_ENCODING_CHECK
    if (!IN_GR96(p[1]))
        RETURN_ERROR();
#endif
    RETURN(2);
}
#endif

276

277

#if SCM_USE_EUCKR
/* EUC-KR scanner.  FIXME: NOT TESTED!  I'm not sure about this encoding.
 * There's also a Microsoft variant called CP949, which is not supported
 * (yet).  RFC 1557 says KS X 1001 is 94x94.
 *
 *   G0 <- ASCII
 *   G1 <- KS X 1001 (aka KSC 5601)
 *
 *   GL <- G0
 *   GR <- G1
 *
 * Reports 0 for empty input, 1 for an ASCII octet and 2 for a GR94 lead
 * octet with a following octet.  A lone lead octet is incomplete, and any
 * other lead octet is an error.  The second octet is validated only when
 * SCM_STRICT_ENCODING_CHECK is enabled. */
static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs)
{
    const char *p = SCM_MBS_GET_STR(mbs);
    const int avail = SCM_MBS_GET_SIZE(mbs);
    ENTER;

    if (avail == 0)
        RETURN(0);

    if (IS_ASCII(p[0]))
        RETURN(1);

    if (!IN_GR94(p[0]))
        RETURN_ERROR();

    EXPECT_SIZE(2);
    if (avail < 2)
        RETURN_INCOMPLETE(avail);
#if SCM_STRICT_ENCODING_CHECK
    if (!IN_GR96(p[1]))
        RETURN_ERROR();
#endif
    RETURN(2);
}
#endif /* SCM_USE_EUCKR */

310

311

/*==== Encodings for Unicode ====*/

312

#if SCM_USE_UTF8
/* RFC 3629 */

/* LEN_CODE(n): an octet whose top n bits are set, e.g. LEN_CODE(2) == 0xC0.
 * MASK(n):     the top n+1 bits, e.g. MASK(2) == 0xE0.
 * IS_LEN(c,n): true when c starts with n one-bits followed by a zero bit,
 *              which marks the lead octet of an n-octet sequence (n >= 2)
 *              or, for n == 1, a continuation octet (10xxxxxx).
 * The macro argument n is fully parenthesized so that an expression
 * argument cannot change the meaning of the shift. */
#define LEN_CODE(n)    (((1 << (n)) - 1) << (8 - (n)))
#define MASK(n)        ((LEN_CODE(n) >> 1) | 0x80)
#define IS_LEN(c, n)   ((MASK(n) & (c)) == LEN_CODE(n))
#define IS_TRAILING(c) (IS_LEN((c), 1))

/* UTF-8 scanner.
 *
 * Reports 0 for empty input and 1 for an ASCII octet.  Otherwise the lead
 * octet's bit pattern selects a length of 2, 3 or 4, and a lead octet
 * matching none of these is an error.  If fewer octets than that are
 * available the result is incomplete, with the size set to what is
 * available.
 *
 * With SCM_STRICT_ENCODING_CHECK every following octet must also be a
 * continuation octet, otherwise the result is an error.
 *
 * NOTE(review): only the bit pattern of the lead octet is examined, so
 * 0xC0, 0xC1 and 0xF5..0xF7 are accepted as lead octets. */
static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs)
{
    const char *str = SCM_MBS_GET_STR(mbs);
    const int size = SCM_MBS_GET_SIZE(mbs);
    int len;
    ENTER;

    if (!size)
        RETURN(0);
    if (IS_ASCII(str[0]))
        RETURN(1);

    if (IS_LEN(str[0], 2))
        len = 2;
    else if (IS_LEN(str[0], 3))
        len = 3;
    else if (IS_LEN(str[0], 4))
        len = 4;
    else
        RETURN_ERROR();

#if SCM_STRICT_ENCODING_CHECK
    {
        int i;

        for (i = 1; i < len; i++) {
            if (size <= i)
                RETURN_INCOMPLETE(size);
            if (!IS_TRAILING(str[i]))
                RETURN_ERROR();
        }
    }
#else /* not SCM_STRICT_ENCODING_CHECK */
    if (size < len)
        RETURN_INCOMPLETE(size);
#endif /* not SCM_STRICT_ENCODING_CHECK */

    RETURN(len);
}

#undef MASK
#undef LEN_CODE
#undef IS_LEN
#undef IS_TRAILING
#endif /* SCM_USE_UTF8 */

360

361

/*==== Other encodings ====*/

362

363

#if SCM_USE_SJIS
/* The cwazy Japanese encoding.  This function implements the JIS X
 * 0213 variant.
 *
 * Lead (or only) octet:
 *   0 .. 0x7F:    ASCII
 *   0x80:         undefined
 *   0x81 .. 0x9F: lead byte of 2-byte char
 *   0xA0:         undefined
 *   0xA1 .. 0xDF: JIS X 0201 katakana (1 byte)
 *   0xE0 .. 0xEF: lead byte of 2-byte char
 *   0xF0 .. 0xFC: lead byte of 2-byte char if JIS X 0213 is used
 *   0xFD .. 0xFF: undefined
 *
 * Trailing octet:
 *   0x40 .. 0x7E: trailing byte of 2-byte char
 *   0x80 .. 0xFC: trailing byte of 2-byte char
 *
 * Reports 0 for empty input, 2 for a lead byte with a following octet
 * (incomplete if that octet is missing) and 1 for everything else,
 * including the undefined octets.  The trailing byte is validated only
 * when SCM_STRICT_ENCODING_CHECK is enabled.
 */
static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs)
{
#define IS_KANA(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xDF)
#define IS_LEAD(c)              \
    (0x81 <= (uchar)(c)         \
     && !IS_KANA(c)             \
     && (uchar)(c) <= 0xFC      \
     && (uchar)(c) != 0xA0)
/* Valid trailing bytes are 0x40..0x7E and 0x80..0xFC, so the single hole
 * in 0x40..0xFC is 0x7F.  (This used to exclude 0x7E, which rejected a
 * valid trailing byte and accepted 0x7F.) */
#define IS_TRAIL(c) \
    (0x40 <= (uchar)(c) && (uchar)(c) <= 0xFC && (uchar)(c) != 0x7F)

    const char *str = SCM_MBS_GET_STR(mbs);
    const int size = SCM_MBS_GET_SIZE(mbs);
    ENTER;

    if (!size)
        RETURN(0);

    if (IS_LEAD(str[0])) {
        EXPECT_SIZE(2);
        if (size < 2)
            RETURN_INCOMPLETE(size);
#if SCM_STRICT_ENCODING_CHECK
        if (!IS_TRAIL(str[1]))
            RETURN_ERROR();
#endif
        RETURN(2);
    }

    /* ASCII, half-width katakana, and the undefined octets. */
    RETURN(1);

#undef IS_KANA
#undef IS_LEAD
#undef IS_TRAIL
}
#endif /* SCM_USE_SJIS */

411

412

/* Single-byte encodings. Please add any that you know are missing.

413

* Sorted alphabetically.

414

415

* ASCII

416

* ISO 646

417

* ISO-8859-*

418

* VISCII

419

420

static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs)

421

{

422

ENTER;

423

if (SCM_MBS_GET_SIZE(mbs))

424

RETURN(1);

425

RETURN(0);

426

}

Older »