~ubuntu-branches/ubuntu/karmic/libxerces2-java/karmic

2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise

236

3 the value of the encoding attribute if one exists, otherwise

237

4 UTF-8.

238

**/

239

if (contentType.equals("text/xml")) {

240

if (charset != null) {

241

detectedEncoding = charset;

242

}

243

else {

244

// see RFC2376 or 3023, section 3.1

245

detectedEncoding = "US-ASCII";

246

}

247

}

248

else if (contentType.equals("application/xml")) {

249

if (charset != null) {

250

detectedEncoding = charset;

251

}

252

else {

253

// see RFC2376 or 3023, section 3.2

254

detectedEncoding = getEncodingName(stream);

255

}

256

}

257

else if (contentType.endsWith("+xml")) {

258

detectedEncoding = getEncodingName(stream);

259

}

260

261

if (detectedEncoding != null) {

262

encoding = detectedEncoding;

263

}

264

// else 3 or 4.

265

}

266

267

encoding = encoding.toUpperCase(Locale.ENGLISH);

268

269

// eat the Byte Order Mark

270

consumeBOM(stream, encoding);

271

272

// If the document is UTF-8 or US-ASCII use

273

// the Xerces readers for these encodings.

274

if (encoding.equals("UTF-8")) {

275

return new UTF8Reader(stream,

276

XMLEntityManager.DEFAULT_BUFFER_SIZE,

277

fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),

278

fErrorReporter.getLocale() );

279

}

280

else if (encoding.equals("US-ASCII")) {

281

return new ASCIIReader(stream,

282

XMLEntityManager.DEFAULT_BUFFER_SIZE,

283

fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),

284

fErrorReporter.getLocale() );

285

}

286

287

// Try to use a Java reader.

288

String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);

289

290

// If the specified encoding wasn't a recognized IANA encoding throw an IOException.

291

// The XIncludeHandler will report this as a ResourceError and then will

292

// attempt to include a fallback if there is one.

293

if (javaEncoding == null) {

294

MessageFormatter aFormatter =

295

fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);

296

Locale aLocale = fErrorReporter.getLocale();

297

throw new IOException( aFormatter.formatMessage( aLocale,

298

"EncodingDeclInvalid",

299

new Object[] {encoding} ) );

300

}

301

302

return new InputStreamReader(stream, javaEncoding);

303

}

304

}

305

306

/**

307

* XMLEntityManager cares about endian-ness, since it creates its own optimized

308

* readers. Since we're just using generic Java readers for now, we're not caring

309

* about endian-ness. If this changes, even more code needs to be copied from

310

* XMLEntity manager. -- PJM

311

312

protected String getEncodingName(InputStream stream) throws IOException {

313

final byte[] b4 = new byte[4];

314

String encoding = null;

315

316

// this has the potential to throw an exception

317

// it will be fixed when we ensure the stream is rewindable (see note above)

318

stream.mark(4);

319

int count = stream.read(b4, 0, 4);

320

stream.reset();

321

if (count == 4) {

322

encoding = getEncodingName(b4);

323

}

324

325

return encoding;

326

}

327

328

/**

329

* Removes the byte order mark from the stream, if it exists.

330

* @param stream

331

* @param encoding

332

* @throws IOException

333

334

protected void consumeBOM(InputStream stream, String encoding)

335

throws IOException {

336

337

byte[] b = new byte[3];

338

int count = 0;

339

stream.mark(3);

340

if (encoding.equals("UTF-8")) {

341

count = stream.read(b, 0, 3);

342

if (count == 3) {

343

int b0 = b[0] & 0xFF;

344

int b1 = b[1] & 0xFF;

345

int b2 = b[2] & 0xFF;

346

if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {

347

// First three bytes are not BOM, so reset.

348

stream.reset();

349

}

350

}

351

else {

352

stream.reset();

353

}

354

}

355

else if (encoding.startsWith("UTF-16")) {

356

count = stream.read(b, 0, 2);

357

if (count == 2) {

358

int b0 = b[0] & 0xFF;

359

int b1 = b[1] & 0xFF;

360

if ((b0 != 0xFE || b1 != 0xFF)

361

&& (b0 != 0xFF || b1 != 0xFE)) {

362

// First two bytes are not BOM, so reset.

363

stream.reset();

364

}

365

}

366

else {

367

stream.reset();

368

}

369

}

370

// We could do UTF-32, but since the getEncodingName() doesn't support that

371

// we won't support it here.

372

// To implement UTF-32, look for: 00 00 FE FF for big-endian

373

// or FF FE 00 00 for little-endian

374

}

375

376

/**

377

* REVISIT: This code is taken from org.apache.xerces.impl.XMLEntityManager.

378

* Is there any way we can share the code, without having it implemented twice?

379

* I think we should make it public and static in XMLEntityManager. --PJM

380

381

* Returns the IANA encoding name that is auto-detected from

382

* the bytes specified, with the endian-ness of that encoding where appropriate.

383

384

* @param b4 The first four bytes of the input.

385

* @return the encoding name, or null if no encoding could be detected

386

387

protected String getEncodingName(byte[] b4) {

388

389

// UTF-16, with BOM

390

int b0 = b4[0] & 0xFF;

391

int b1 = b4[1] & 0xFF;

392

if (b0 == 0xFE && b1 == 0xFF) {

393

// UTF-16, big-endian

394

return "UTF-16BE";

395

}

396

if (b0 == 0xFF && b1 == 0xFE) {

397

// UTF-16, little-endian

398

return "UTF-16LE";

399

}

400

401

// UTF-8 with a BOM

402

int b2 = b4[2] & 0xFF;

403

if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {

404

return "UTF-8";

405

}

406

407

// other encodings

408

int b3 = b4[3] & 0xFF;

409

if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {

410

// UCS-4, big endian (1234)

411

return "ISO-10646-UCS-4";

412

}

413

if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {

414

// UCS-4, little endian (4321)

415

return "ISO-10646-UCS-4";

416

}

417

if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {

418

// UCS-4, unusual octet order (2143)

419

return "ISO-10646-UCS-4";

420

}

421

if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {

422

// UCS-4, unusual octect order (3412)

423

return "ISO-10646-UCS-4";

424

}

425

if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {

426

// UTF-16, big-endian, no BOM

427

// (or could turn out to be UCS-2...

428

return "UTF-16BE";

429

}

430

if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {

431

// UTF-16, little-endian, no BOM

432

// (or could turn out to be UCS-2...

433

return "UTF-16LE";

434

}

435

if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {

436

// EBCDIC

437

// a la xerces1, return CP037 instead of EBCDIC here

438

return "CP037";

439

}

440

441

// this signals us to use the value from the encoding attribute

442

return null;

443

444

} // getEncodingName(byte[]):Object[]

445

446

/**

447

* Read the input stream as text, and pass the text on to the XIncludeHandler

448

* using calls to characters(). This will read all of the text it can from the

449

* resource.

450

451

* @throws IOException

452

453

public void parse() throws IOException {

454

// REVISIT: This method needs to be rewritten to improve performance: both

455

// time and memory. We should be reading chunks and reporting chunks instead

456

// of reading characters individually and reporting all the characters in

457

// one callback. Also, currently we don't provide any locator information:

458

// line number, column number, etc... so if we report an error it will appear

459

// as if the invalid XML character was in the include parent. -- mrglavas

460

XMLStringBuffer buffer = new XMLStringBuffer();

461

fReader = getReader(fSource);

462

int ch;

463

while((ch = fReader.read()) != -1) {

464

if (isValid(ch)) {

465

buffer.append((char)ch);

466

}

467

else if (XMLChar.isHighSurrogate(ch)) {

468

int ch2 = fReader.read();

469

if (XMLChar.isLowSurrogate(ch2)) {

470

471

// convert surrogates to a supplemental character

472

int sup = XMLChar.supplemental((char)ch, (char)ch2);

473

474

// supplemental character must be a valid XML character

475

if (!isValid(sup)) {

476

fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,

477

"InvalidCharInContent",

478

new Object[] { Integer.toString(sup, 16) },

479

XMLErrorReporter.SEVERITY_FATAL_ERROR);

480

continue;

481

}

482

buffer.append((char) ch);

483

buffer.append((char) ch2);

484

}

485

else {

486

fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,

487

"InvalidCharInContent",

488

new Object[] { Integer.toString(ch, 16) },

489

XMLErrorReporter.SEVERITY_FATAL_ERROR);

490

}

491

}

492

else {

493

fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,

494

"InvalidCharInContent",

495

new Object[] { Integer.toString(ch, 16) },

496

XMLErrorReporter.SEVERITY_FATAL_ERROR);

497

}

498

}

499

if (fHandler != null && buffer.length > 0) {

500

fHandler.characters(

501

buffer,

502

fHandler.modifyAugmentations(null, true));

503

}

504

}

505

506

/**

507

* Closes the stream. Call this after parse(), or when there is no longer any need

508

* for this object.

509

510

* @throws IOException

511

512

public void close() throws IOException {

513

if (fReader != null) {

514

fReader.close();

515

}

516

}

517

518

/**

519

* Returns true if the specified character is a valid XML character

520

* as per the rules of XML 1.0.

521

522

* @param ch The character to check.

523

524

protected boolean isValid(int ch) {

525

return XMLChar.isValid(ch);

526

}

527

}

b'\\ No newline at end of file'

Older »