~james-page/ubuntu/raring/juju-core/1.10.0

« back to all changes in this revision

Viewing changes to src/code.google.com/p/go.net/html/token.go

Committer: James Page
Date: 2013-04-24 15:16:56 UTC
mfrom: (1.1.1)
Revision ID: james.page@canonical.com-20130424151656-xr50npm6aq4tdeb1

Recut with -1 upstream release

files added:
src/code.google.com/p/go.net/.hg_archival.txt

src/labix.org/v2/mgo/.bzr-upload.revid

src/launchpad.net/gnuflag/.bzr-upload.revid

src/launchpad.net/goamz/.bzr-upload.revid

src/launchpad.net/gocheck/.bzr-upload.revid

src/launchpad.net/gomaasapi/.bzr-upload.revid

src/launchpad.net/goose/.bzr-upload.revid

src/launchpad.net/goose/LICENSE

src/launchpad.net/goyaml/.bzr-upload.revid

src/launchpad.net/juju-core/.bzr-upload.revid

src/launchpad.net/tomb/.bzr-upload.revid

files removed:
src/code.google.com/p/go.net/html

src/code.google.com/p/go.net/html/atom

src/code.google.com/p/go.net/html/atom/atom.go

src/code.google.com/p/go.net/html/atom/atom_test.go

src/code.google.com/p/go.net/html/atom/gen.go

src/code.google.com/p/go.net/html/atom/table.go

src/code.google.com/p/go.net/html/atom/table_test.go

src/code.google.com/p/go.net/html/const.go

src/code.google.com/p/go.net/html/doc.go

src/code.google.com/p/go.net/html/doctype.go

src/code.google.com/p/go.net/html/entity.go

src/code.google.com/p/go.net/html/entity_test.go

src/code.google.com/p/go.net/html/escape.go

src/code.google.com/p/go.net/html/example_test.go

src/code.google.com/p/go.net/html/foreign.go

src/code.google.com/p/go.net/html/node.go

src/code.google.com/p/go.net/html/node_test.go

src/code.google.com/p/go.net/html/parse.go

src/code.google.com/p/go.net/html/parse_test.go

src/code.google.com/p/go.net/html/render.go

src/code.google.com/p/go.net/html/render_test.go

src/code.google.com/p/go.net/html/testdata

src/code.google.com/p/go.net/html/testdata/go1.html

src/code.google.com/p/go.net/html/testdata/webkit

src/code.google.com/p/go.net/html/testdata/webkit/README

src/code.google.com/p/go.net/html/testdata/webkit/adoption01.dat

src/code.google.com/p/go.net/html/testdata/webkit/adoption02.dat

src/code.google.com/p/go.net/html/testdata/webkit/comments01.dat

src/code.google.com/p/go.net/html/testdata/webkit/doctype01.dat

src/code.google.com/p/go.net/html/testdata/webkit/entities01.dat

src/code.google.com/p/go.net/html/testdata/webkit/entities02.dat

src/code.google.com/p/go.net/html/testdata/webkit/html5test-com.dat

src/code.google.com/p/go.net/html/testdata/webkit/inbody01.dat

src/code.google.com/p/go.net/html/testdata/webkit/isindex.dat

src/code.google.com/p/go.net/html/testdata/webkit/pending-spec-changes-plain-text-unsafe.dat

src/code.google.com/p/go.net/html/testdata/webkit/pending-spec-changes.dat

src/code.google.com/p/go.net/html/testdata/webkit/plain-text-unsafe.dat

src/code.google.com/p/go.net/html/testdata/webkit/scriptdata01.dat

src/code.google.com/p/go.net/html/testdata/webkit/scripted

src/code.google.com/p/go.net/html/testdata/webkit/scripted/adoption01.dat

src/code.google.com/p/go.net/html/testdata/webkit/scripted/webkit01.dat

src/code.google.com/p/go.net/html/testdata/webkit/tables01.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests1.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests10.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests11.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests12.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests14.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests15.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests16.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests17.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests18.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests19.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests2.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests20.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests21.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests22.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests23.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests24.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests25.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests26.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests3.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests4.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests5.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests6.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests7.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests8.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests9.dat

src/code.google.com/p/go.net/html/testdata/webkit/tests_innerHTML_1.dat

src/code.google.com/p/go.net/html/testdata/webkit/tricky01.dat

src/code.google.com/p/go.net/html/testdata/webkit/webkit01.dat

src/code.google.com/p/go.net/html/testdata/webkit/webkit02.dat

src/code.google.com/p/go.net/html/token.go

src/code.google.com/p/go.net/html/token_test.go

src/code.google.com/p/go.net/spdy/dictionary.go

src/labix.org/v2/mgo/.bzrignore

src/launchpad.net/gocheck/.bzrignore

src/launchpad.net/goose/.bzrignore

src/launchpad.net/goose/COPYING

src/launchpad.net/goyaml/.bzrignore

src/launchpad.net/juju-core/.bzrignore

src/launchpad.net/juju-core/cmd/juju/juju

src/launchpad.net/lpad

src/launchpad.net/lpad/Makefile

src/launchpad.net/lpad/archive.go

src/launchpad.net/lpad/archive_test.go

src/launchpad.net/lpad/blueprint.go

src/launchpad.net/lpad/blueprint_test.go

src/launchpad.net/lpad/branch.go

src/launchpad.net/lpad/branch_test.go

src/launchpad.net/lpad/bug.go

src/launchpad.net/lpad/bug_test.go

src/launchpad.net/lpad/build.go

src/launchpad.net/lpad/build_test.go

src/launchpad.net/lpad/builder.go

src/launchpad.net/lpad/builder_test.go

src/launchpad.net/lpad/distro.go

src/launchpad.net/lpad/distro_test.go

src/launchpad.net/lpad/model_test.go

src/launchpad.net/lpad/oauth.go

src/launchpad.net/lpad/oauth_test.go

src/launchpad.net/lpad/person.go

src/launchpad.net/lpad/person_test.go

src/launchpad.net/lpad/project.go

src/launchpad.net/lpad/project_test.go

src/launchpad.net/lpad/session.go

src/launchpad.net/lpad/session_test.go

src/launchpad.net/lpad/source.go

src/launchpad.net/lpad/source_test.go

src/launchpad.net/lpad/suite_test.go

src/launchpad.net/lpad/value.go

src/launchpad.net/lpad/value_test.go

files modified:
debian/changelog

debian/copyright

src/code.google.com/p/go.net/ipv4/control_linux.go

src/code.google.com/p/go.net/ipv4/header.go

src/code.google.com/p/go.net/ipv4/header_test.go

src/code.google.com/p/go.net/ipv4/mockicmp_test.go

src/code.google.com/p/go.net/ipv4/mocktransponder_test.go

src/code.google.com/p/go.net/ipv4/multicast_test.go

src/code.google.com/p/go.net/ipv4/multicastlistener_test.go

src/code.google.com/p/go.net/ipv4/multicastsockopt_test.go

src/code.google.com/p/go.net/ipv4/packet.go

src/code.google.com/p/go.net/ipv4/payload.go

src/code.google.com/p/go.net/ipv4/unicast_test.go

src/code.google.com/p/go.net/ipv4/unicastsockopt_test.go

src/code.google.com/p/go.net/proxy/per_host.go

src/code.google.com/p/go.net/proxy/proxy_test.go

src/code.google.com/p/go.net/proxy/socks5.go

src/code.google.com/p/go.net/publicsuffix/gen.go

src/code.google.com/p/go.net/publicsuffix/list.go

src/code.google.com/p/go.net/publicsuffix/list_test.go

src/code.google.com/p/go.net/publicsuffix/table.go

src/code.google.com/p/go.net/publicsuffix/table_test.go

src/code.google.com/p/go.net/spdy/read.go

src/code.google.com/p/go.net/spdy/spdy_test.go

src/code.google.com/p/go.net/spdy/types.go

src/code.google.com/p/go.net/spdy/write.go

src/code.google.com/p/go.net/websocket/hybi.go

src/code.google.com/p/go.net/websocket/websocket.go

src/launchpad.net/gnuflag/flag.go

src/launchpad.net/gnuflag/flag_test.go

src/launchpad.net/juju-core/cmd/juju/bootstrap.go

src/launchpad.net/juju-core/environs/cert.go

src/launchpad.net/juju-core/environs/cert_test.go

src/launchpad.net/juju-core/environs/config/home.go

Show diffs side-by-side

added added

removed removed

src/code.google.com/p/go.net/html/token.go

// Use of this source code is governed by a BSD-style

// license that can be found in the LICENSE file.

package html

import (

"bytes"

"io"

"strconv"

"strings"

"code.google.com/p/go.net/html/atom"

)

// TokenType identifies the kind of a Token.
type TokenType uint32

// The TokenType values. Their numeric order is fixed by iota.
const (
	// ErrorToken reports that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken is a text node.
	TextToken
	// StartTagToken is an opening tag such as <a>.
	StartTagToken
	// EndTagToken is a closing tag such as </a>.
	EndTagToken
	// SelfClosingTagToken is a self-closing tag such as <br/>.
	SelfClosingTagToken
	// CommentToken is a comment such as <!--x-->.
	CommentToken
	// DoctypeToken is a doctype declaration such as <!DOCTYPE x>.
	DoctypeToken
)

// String returns the name of the TokenType, or "Invalid(n)" when t is not
// one of the declared constants.
func (t TokenType) String() string {
	names := [...]string{
		ErrorToken:          "Error",
		TextToken:           "Text",
		StartTagToken:       "StartTag",
		EndTagToken:         "EndTag",
		SelfClosingTagToken: "SelfClosingTag",
		CommentToken:        "Comment",
		DoctypeToken:        "Doctype",
	}
	// Compare in the unsigned domain so that huge values cannot wrap around.
	if t < TokenType(len(names)) {
		return names[t]
	}
	return "Invalid(" + strconv.Itoa(int(t)) + ")"
}

// An Attribute is a (Namespace, Key, Val) triple describing one attribute of
// a tag. Namespace is non-empty for foreign attributes like xlink, Key is
// alphabetic (and hence does not contain escapable characters like '&', '<'
// or '>'), and Val is unescaped (it looks like "a<b" rather than "a&lt;b").
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
	Namespace string
	Key       string
	Val       string
}

// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
// zero if Data is not a known tag name.
type Token struct {
	// Type is the kind of token.
	Type TokenType
	// DataAtom is the atom for Data on tag tokens, or zero if Data is not a
	// known tag name.
	DataAtom atom.Atom
	// Data is the unescaped tag name or content.
	Data string
	// Attr holds the attributes of a tag token.
	Attr []Attribute
}

// tagString renders the inside of a tag: the tag name in t.Data followed by
// each attribute as ` key="value"`, with the value passed through escape.
// A tag without attributes is rendered as just its name.
func (t Token) tagString() string {
	if len(t.Attr) == 0 {
		return t.Data
	}
	out := bytes.NewBufferString(t.Data)
	for i := range t.Attr {
		attr := &t.Attr[i]
		out.WriteByte(' ')
		out.WriteString(attr.Key)
		out.WriteString(`="`)
		escape(out, attr.Val)
		out.WriteByte('"')
	}
	return out.String()
}

// String returns a string representation of the Token.
//
// Text is returned escaped via EscapeString; tags are rendered with their
// attributes via tagString; comments are wrapped in "<!--" and "-->";
// doctypes are rendered as "<!DOCTYPE data>". An ErrorToken yields the empty
// string, and an unknown Type yields "Invalid(n)".
func (t Token) String() string {
	switch t.Type {
	case ErrorToken:
		return ""
	case TextToken:
		return EscapeString(t.Data)
	case StartTagToken:
		return "<" + t.tagString() + ">"
	case EndTagToken:
		return "</" + t.tagString() + ">"
	case SelfClosingTagToken:
		return "<" + t.tagString() + "/>"
	case CommentToken:
		// Render the comment's data inside its delimiters, rather than
		// dropping it and returning an empty string.
		return "<!--" + t.Data + "-->"
	case DoctypeToken:
		return "<!DOCTYPE " + t.Data + ">"
	}
	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}

115

116

// span denotes the half-open byte range [start, end) within a Tokenizer's
// buffer: start is inclusive, end is exclusive.
type span struct {
	start int
	end   int
}

121

122

// A Tokenizer returns a stream of HTML Tokens.

123

type Tokenizer struct {

124

// r is the source of the HTML text.

125

r io.Reader

126

// tt is the TokenType of the current token.

127

tt TokenType

128

// err is the first error encountered during tokenization. It is possible

129

// for tt != Error && err != nil to hold: this means that Next returned a

130

// valid token but the subsequent Next call will return an error token.

131

// For example, if the HTML text input was just "plain", then the first

132

// Next call would set z.err to io.EOF but return a TextToken, and all

133

// subsequent Next calls would return an ErrorToken.

134

// err is never reset. Once it becomes non-nil, it stays non-nil.

135

err error

136

// buf[raw.start:raw.end] holds the raw bytes of the current token.

137

// buf[raw.end:] is buffered input that will yield future tokens.

138

raw span

139

buf []byte

140

// buf[data.start:data.end] holds the raw bytes of the current token's data:

141

// a text token's text, a tag token's tag name, etc.

142

data span

143

// pendingAttr is the attribute key and value currently being tokenized.

144

// When complete, pendingAttr is pushed onto attr. nAttrReturned is

145

// incremented on each call to TagAttr.

146

pendingAttr [2]span

147

attr [][2]span

148

nAttrReturned int

149

// rawTag is the "script" in "</script>" that closes the next token. If

150

// non-empty, the subsequent call to Next will return a raw or RCDATA text

151

// token: one that treats "<p>" as text instead of an element.

152

// rawTag's contents are lower-cased.

153

rawTag string

154

// textIsRaw is whether the current text token's data is not escaped.

155

textIsRaw bool

156

// convertNUL is whether NUL bytes in the current token's data should

157

// be converted into \ufffd replacement characters.

158

convertNUL bool

159

// allowCDATA is whether CDATA sections are allowed in the current context.

160

allowCDATA bool

161

}

162

163

// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as

164

// the text "foo". The default value is false, which means to recognize it as

165

// a bogus comment "" instead.

166

167

// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and

168

// only if tokenizing foreign content, such as MathML and SVG. However,

169

// tracking foreign-contentness is difficult to do purely in the tokenizer,

170

// as opposed to the parser, due to HTML integration points: an <svg> element

171

// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-

172

// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the

173

// responsibility of the user of a tokenizer to call AllowCDATA as appropriate.

174

// In practice, if using the tokenizer without caring whether MathML or SVG

175

// CDATA is text or comments, such as tokenizing HTML to find all the anchor

176

// text, it is acceptable to ignore this responsibility.

177

func (z *Tokenizer) AllowCDATA(allowCDATA bool) {

178

z.allowCDATA = allowCDATA

179

}

180

181

// NextIsNotRawText instructs the tokenizer that the next token should not be

182

// considered as 'raw text'. Some elements, such as script and title elements,

183

// normally require the next token after the opening tag to be 'raw text' that

184

// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"

185

// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and

186

// an end tag token for "</title>". There are no distinct start tag or end tag

187

// tokens for the "<b>" and "</b>".

188

189

// This tokenizer implementation will generally look for raw text at the right

190

// times. Strictly speaking, an HTML5 compliant tokenizer should not look for

191

// raw text if in foreign content: <title> generally needs raw text, but a

192

// <title> inside an <svg> does not. Another example is that a <textarea>

193

// generally needs raw text, but a <textarea> is not allowed as an immediate

194

// child of a <select>; in normal parsing, a <textarea> implies </select>, but

195

// one cannot close the implicit element when parsing a <select>'s InnerHTML.

196

// Similarly to AllowCDATA, tracking the correct moment to override raw-text-

197

// ness is difficult to do purely in the tokenizer, as opposed to the parser.

198

// For strict compliance with the HTML5 tokenization algorithm, it is the

199

// responsibility of the user of a tokenizer to call NextIsNotRawText as

200

// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this

201

// responsibility for basic usage.

202

203

// Note that this 'raw text' concept is different from the one offered by the

204

// Tokenizer.Raw method.

205

func (z *Tokenizer) NextIsNotRawText() {

206

z.rawTag = ""

207

}

208

209

// Err returns the error associated with the most recent ErrorToken token.

210

// This is typically io.EOF, meaning the end of tokenization.

211

func (z *Tokenizer) Err() error {

212

if z.tt != ErrorToken {

213

return nil

214

}

215

return z.err

216

}

217

218

// readByte returns the next byte from the input stream, doing a buffered read

219

// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte

220

// slice that holds all the bytes read so far for the current token.

221

// It sets z.err if the underlying reader returns an error.

222

// Pre-condition: z.err == nil.

223

func (z *Tokenizer) readByte() byte {

224

if z.raw.end >= len(z.buf) {

225

// Our buffer is exhausted and we have to read from z.r.

226

// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length

227

// z.raw.end - z.raw.start is more than half the capacity of z.buf, then we

228

// allocate a new buffer before the copy.

229

c := cap(z.buf)

230

d := z.raw.end - z.raw.start

231

var buf1 []byte

232

if 2*d > c {

233

buf1 = make([]byte, d, 2*c)

234

} else {

235

buf1 = z.buf[:d]

236

}

237

copy(buf1, z.buf[z.raw.start:z.raw.end])

238

if x := z.raw.start; x != 0 {

239

// Adjust the data/attr spans to refer to the same contents after the copy.

240

z.data.start -= x

241

z.data.end -= x

242

z.pendingAttr[0].start -= x

243

z.pendingAttr[0].end -= x

244

z.pendingAttr[1].start -= x

245

z.pendingAttr[1].end -= x

246

for i := range z.attr {

247

z.attr[i][0].start -= x

248

z.attr[i][0].end -= x

249

z.attr[i][1].start -= x

250

z.attr[i][1].end -= x

251

}

252

}

253

z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]

254

// Now that we have copied the live bytes to the start of the buffer,

255

// we read from z.r into the remainder.

256

n, err := z.r.Read(buf1[d:cap(buf1)])

257

if err != nil {

258

z.err = err

259

return 0

260

}

261

z.buf = buf1[:d+n]

262

}

263

x := z.buf[z.raw.end]

264

z.raw.end++

265

return x

266

}

267

268

// skipWhiteSpace skips past any white space.

269

func (z *Tokenizer) skipWhiteSpace() {

270

if z.err != nil {

271

return

272

}

273

for {

274

c := z.readByte()

275

if z.err != nil {

276

return

277

}

278

switch c {

279

case ' ', '\n', '\r', '\t', '\f':

280

// No-op.

281

default:

282

z.raw.end--

283

return

284

}

285

}

286

}

287

288

// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and

289

// is typically something like "script" or "textarea".

290

func (z *Tokenizer) readRawOrRCDATA() {

291

if z.rawTag == "script" {

292

z.readScript()

293

z.textIsRaw = true

294

z.rawTag = ""

295

return

296

}

297

loop:

298

for {

299

c := z.readByte()

300

if z.err != nil {

301

break loop

302

}

303

if c != '<' {

304

continue loop

305

}

306

c = z.readByte()

307

if z.err != nil {

308

break loop

309

}

310

if c != '/' {

311

continue loop

312

}

313

if z.readRawEndTag() || z.err != nil {

314

break loop

315

}

316

}

317

z.data.end = z.raw.end

318

// A textarea's or title's RCDATA can contain escaped entities.

319

z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"

320

z.rawTag = ""

321

}

322

323

// readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag.

324

// If it succeeds, it backs up the input position to reconsume the tag and

325

// returns true. Otherwise it returns false. The opening "</" has already been

326

// consumed.

327

func (z *Tokenizer) readRawEndTag() bool {

328

for i := 0; i < len(z.rawTag); i++ {

329

c := z.readByte()

330

if z.err != nil {

331

return false

332

}

333

if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {

334

z.raw.end--

335

return false

336

}

337

}

338

c := z.readByte()

339

if z.err != nil {

340

return false

341

}

342

switch c {

343

case ' ', '\n', '\r', '\t', '\f', '/', '>':

344

// The 3 is 2 for the leading "</" plus 1 for the trailing character c.

345

z.raw.end -= 3 + len(z.rawTag)

346

return true

347

}

348

z.raw.end--

349

return false

350

}

351

352

// readScript reads until the next </script> tag, following the byzantine

353

// rules for escaping/hiding the closing tag.

354

func (z *Tokenizer) readScript() {

355

defer func() {

356

z.data.end = z.raw.end

357

}()

358

var c byte

359

360

scriptData:

361

c = z.readByte()

362

if z.err != nil {

363

return

364

}

365

if c == '<' {

366

goto scriptDataLessThanSign

367

}

368

goto scriptData

369

370

scriptDataLessThanSign:

371

c = z.readByte()

372

if z.err != nil {

373

return

374

}

375

switch c {

376

case '/':

377

goto scriptDataEndTagOpen

378

case '!':

379

goto scriptDataEscapeStart

380

}

381

z.raw.end--

382

goto scriptData

383

384

scriptDataEndTagOpen:

385

if z.readRawEndTag() || z.err != nil {

386

return

387

}

388

goto scriptData

389

390

scriptDataEscapeStart:

391

c = z.readByte()

392

if z.err != nil {

393

return

394

}

395

if c == '-' {

396

goto scriptDataEscapeStartDash

397

}

398

z.raw.end--

399

goto scriptData

400

401

scriptDataEscapeStartDash:

402

c = z.readByte()

403

if z.err != nil {

404

return

405

}

406

if c == '-' {

407

goto scriptDataEscapedDashDash

408

}

409

z.raw.end--

410

goto scriptData

411

412

scriptDataEscaped:

413

c = z.readByte()

414

if z.err != nil {

415

return

416

}

417

switch c {

418

case '-':

419

goto scriptDataEscapedDash

420

case '<':

421

goto scriptDataEscapedLessThanSign

422

}

423

goto scriptDataEscaped

424

425

scriptDataEscapedDash:

426

c = z.readByte()

427

if z.err != nil {

428

return

429

}

430

switch c {

431

case '-':

432

goto scriptDataEscapedDashDash

433

case '<':

434

goto scriptDataEscapedLessThanSign

435

}

436

goto scriptDataEscaped

437

438

scriptDataEscapedDashDash:

439

c = z.readByte()

440

if z.err != nil {

441

return

442

}

443

switch c {

444

case '-':

445

goto scriptDataEscapedDashDash

446

case '<':

447

goto scriptDataEscapedLessThanSign

448

case '>':

449

goto scriptData

450

}

451

goto scriptDataEscaped

452

453

scriptDataEscapedLessThanSign:

454

c = z.readByte()

455

if z.err != nil {

456

return

457

}

458

if c == '/' {

459

goto scriptDataEscapedEndTagOpen

460

}

461

if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {

462

goto scriptDataDoubleEscapeStart

463

}

464

z.raw.end--

465

goto scriptData

466

467

scriptDataEscapedEndTagOpen:

468

if z.readRawEndTag() || z.err != nil {

469

return

470

}

471

goto scriptDataEscaped

472

473

scriptDataDoubleEscapeStart:

474

z.raw.end--

475

for i := 0; i < len("script"); i++ {

476

c = z.readByte()

477

if z.err != nil {

478

return

479

}

480

if c != "script"[i] && c != "SCRIPT"[i] {

481

z.raw.end--

482

goto scriptDataEscaped

483

}

484

}

485

c = z.readByte()

486

if z.err != nil {

487

return

488

}

489

switch c {

490

case ' ', '\n', '\r', '\t', '\f', '/', '>':

491

goto scriptDataDoubleEscaped

492

}

493

z.raw.end--

494

goto scriptDataEscaped

495

496

scriptDataDoubleEscaped:

497

c = z.readByte()

498

if z.err != nil {

499

return

500

}

501

switch c {

502

case '-':

503

goto scriptDataDoubleEscapedDash

504

case '<':

505

goto scriptDataDoubleEscapedLessThanSign

506

}

507

goto scriptDataDoubleEscaped

508

509

scriptDataDoubleEscapedDash:

510

c = z.readByte()

511

if z.err != nil {

512

return

513

}

514

switch c {

515

case '-':

516

goto scriptDataDoubleEscapedDashDash

517

case '<':

518

goto scriptDataDoubleEscapedLessThanSign

519

}

520

goto scriptDataDoubleEscaped

521

522

scriptDataDoubleEscapedDashDash:

523

c = z.readByte()

524

if z.err != nil {

525

return

526

}

527

switch c {

528

case '-':

529

goto scriptDataDoubleEscapedDashDash

530

case '<':

531

goto scriptDataDoubleEscapedLessThanSign

532

case '>':

533

goto scriptData

534

}

535

goto scriptDataDoubleEscaped

536

537

scriptDataDoubleEscapedLessThanSign:

538

c = z.readByte()

539

if z.err != nil {

540

return

541

}

542

if c == '/' {

543

goto scriptDataDoubleEscapeEnd

544

}

545

z.raw.end--

546

goto scriptDataDoubleEscaped

547

548

scriptDataDoubleEscapeEnd:

549

if z.readRawEndTag() {

550

z.raw.end += len("</script>")

551

goto scriptDataEscaped

552

}

553

if z.err != nil {

554

return

555

}

556

goto scriptDataDoubleEscaped

557

}

558

559

// readComment reads the next comment token starting with "<!--". The opening

560

// "<!--" has already been consumed.

561

func (z *Tokenizer) readComment() {

562

z.data.start = z.raw.end

563

defer func() {

564

if z.data.end < z.data.start {

565

// It's a comment with no data, like <!-->.

566

z.data.end = z.data.start

567

}

568

}()

569

for dashCount := 2; ; {

570

c := z.readByte()

571

if z.err != nil {

572

// Ignore up to two dashes at EOF.

573

if dashCount > 2 {

574

dashCount = 2

575

}

576

z.data.end = z.raw.end - dashCount

577

return

578

}

579

switch c {

580

case '-':

581

dashCount++

582

continue

583

case '>':

584

if dashCount >= 2 {

585

z.data.end = z.raw.end - len("-->")

586

return

587

}

588

case '!':

589

if dashCount >= 2 {

590

c = z.readByte()

591

if z.err != nil {

592

z.data.end = z.raw.end

593

return

594

}

595

if c == '>' {

596

z.data.end = z.raw.end - len("--!>")

597

return

598

}

599

}

600

}

601

dashCount = 0

602

}

603

}

604

605

// readUntilCloseAngle reads until the next ">".

606

func (z *Tokenizer) readUntilCloseAngle() {

607

z.data.start = z.raw.end

608

for {

609

c := z.readByte()

610

if z.err != nil {

611

z.data.end = z.raw.end

612

return

613

}

614

if c == '>' {

615

z.data.end = z.raw.end - len(">")

616

return

617

}

618

}

619

}

620

621

// readMarkupDeclaration reads the next token starting with "<!". It might be

622

// a "", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or

623

// "<!a bogus comment". The opening "<!" has already been consumed.

624

func (z *Tokenizer) readMarkupDeclaration() TokenType {

625

z.data.start = z.raw.end

626

var c [2]byte

627

for i := 0; i < 2; i++ {

628

c[i] = z.readByte()

629

if z.err != nil {

630

z.data.end = z.raw.end

631

return CommentToken

632

}

633

}

634

if c[0] == '-' && c[1] == '-' {

635

z.readComment()

636

return CommentToken

637

}

638

z.raw.end -= 2

639

if z.readDoctype() {

640

return DoctypeToken

641

}

642

if z.allowCDATA && z.readCDATA() {

643

z.convertNUL = true

644

return TextToken

645

}

646

// It's a bogus comment.

647

z.readUntilCloseAngle()

648

return CommentToken

649

}

650

651

// readDoctype attempts to read a doctype declaration and returns true if

652

// successful. The opening "<!" has already been consumed.

653

func (z *Tokenizer) readDoctype() bool {

654

const s = "DOCTYPE"

655

for i := 0; i < len(s); i++ {

656

c := z.readByte()

657

if z.err != nil {

658

z.data.end = z.raw.end

659

return false

660

}

661

if c != s[i] && c != s[i]+('a'-'A') {

662

// Back up to read the fragment of "DOCTYPE" again.

663

z.raw.end = z.data.start

664

return false

665

}

666

}

667

if z.skipWhiteSpace(); z.err != nil {

668

z.data.start = z.raw.end

669

z.data.end = z.raw.end

670

return true

671

}

672

z.readUntilCloseAngle()

673

return true

674

}

675

676

// readCDATA attempts to read a CDATA section and returns true if

677

// successful. The opening "<!" has already been consumed.

678

func (z *Tokenizer) readCDATA() bool {

679

const s = "[CDATA["

680

for i := 0; i < len(s); i++ {

681

c := z.readByte()

682

if z.err != nil {

683

z.data.end = z.raw.end

684

return false

685

}

686

if c != s[i] {

687

// Back up to read the fragment of "[CDATA[" again.

688

z.raw.end = z.data.start

689

return false

690

}

691

}

692

z.data.start = z.raw.end

693

brackets := 0

694

for {

695

c := z.readByte()

696

if z.err != nil {

697

z.data.end = z.raw.end

698

return true

699

}

700

switch c {

701

case ']':

702

brackets++

703

case '>':

704

if brackets >= 2 {

705

z.data.end = z.raw.end - len("]]>")

706

return true

707

}

708

brackets = 0

709

default:

710

brackets = 0

711

}

712

}

713

panic("unreachable")

714

}

715

716

// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]

717

// case-insensitively matches any element of ss.

718

func (z *Tokenizer) startTagIn(ss ...string) bool {

719

loop:

720

for _, s := range ss {

721

if z.data.end-z.data.start != len(s) {

722

continue loop

723

}

724

for i := 0; i < len(s); i++ {

725

c := z.buf[z.data.start+i]

726

if 'A' <= c && c <= 'Z' {

727

c += 'a' - 'A'

728

}

729

if c != s[i] {

730

continue loop

731

}

732

}

733

return true

734

}

735

return false

736

}

737

738

// readStartTag reads the next start tag token. The opening "<a" has already

739

// been consumed, where 'a' means anything in [A-Za-z].

740

func (z *Tokenizer) readStartTag() TokenType {

741

z.readTag(true)

742

if z.err != nil {

743

return ErrorToken

744

}

745

// Several tags flag the tokenizer's next token as raw.

746

c, raw := z.buf[z.data.start], false

747

if 'A' <= c && c <= 'Z' {

748

c += 'a' - 'A'

749

}

750

switch c {

751

case 'i':

752

raw = z.startTagIn("iframe")

753

case 'n':

754

raw = z.startTagIn("noembed", "noframes", "noscript")

755

case 'p':

756

raw = z.startTagIn("plaintext")

757

case 's':

758

raw = z.startTagIn("script", "style")

759

case 't':

760

raw = z.startTagIn("textarea", "title")

761

case 'x':

762

raw = z.startTagIn("xmp")

763

}

764

if raw {

765

z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))

766

}

767

// Look for a self-closing token like "<br/>".

768

if z.err == nil && z.buf[z.raw.end-2] == '/' {

769

return SelfClosingTagToken

770

}

771

return StartTagToken

772

}

773

774

// readTag reads the next tag token and its attributes. If saveAttr, those

775

// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.

776

// The opening "<a" or "</a" has already been consumed, where 'a' means anything

777

// in [A-Za-z].

778

func (z *Tokenizer) readTag(saveAttr bool) {

779

z.attr = z.attr[:0]

780

z.nAttrReturned = 0

781

// Read the tag name and attribute key/value pairs.

782

z.readTagName()

783

if z.skipWhiteSpace(); z.err != nil {

784

return

785

}

786

for {

787

c := z.readByte()

788

if z.err != nil || c == '>' {

789

break

790

}

791

z.raw.end--

792

z.readTagAttrKey()

793

z.readTagAttrVal()

794

// Save pendingAttr if saveAttr and that attribute has a non-empty key.

795

if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {

796

z.attr = append(z.attr, z.pendingAttr)

797

}

798

if z.skipWhiteSpace(); z.err != nil {

799

break

800

}

801

}

802

}

803

804

// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)

805

// is positioned such that the first byte of the tag name (the "d" in "<div")

806

// has already been consumed.

807

func (z *Tokenizer) readTagName() {

808

z.data.start = z.raw.end - 1

809

for {

810

c := z.readByte()

811

if z.err != nil {

812

z.data.end = z.raw.end

813

return

814

}

815

switch c {

816

case ' ', '\n', '\r', '\t', '\f':

817

z.data.end = z.raw.end - 1

818

return

819

case '/', '>':

820

z.raw.end--

821

z.data.end = z.raw.end

822

return

823

}

824

}

825

}

826

827

// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".

828

// Precondition: z.err == nil.

829

func (z *Tokenizer) readTagAttrKey() {

830

z.pendingAttr[0].start = z.raw.end

831

for {

832

c := z.readByte()

833

if z.err != nil {

834

z.pendingAttr[0].end = z.raw.end

835

return

836

}

837

switch c {

838

case ' ', '\n', '\r', '\t', '\f', '/':

839

z.pendingAttr[0].end = z.raw.end - 1

840

return

841

case '=', '>':

842

z.raw.end--

843

z.pendingAttr[0].end = z.raw.end

844

return

845

}

846

}

847

}

848

849

// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".

850

func (z *Tokenizer) readTagAttrVal() {

851

z.pendingAttr[1].start = z.raw.end

852

z.pendingAttr[1].end = z.raw.end

853

if z.skipWhiteSpace(); z.err != nil {

854

return

855

}

856

c := z.readByte()

857

if z.err != nil {

858

return

859

}

860

if c != '=' {

861

z.raw.end--

862

return

863

}

864

if z.skipWhiteSpace(); z.err != nil {

865

return

866

}

867

quote := z.readByte()

868

if z.err != nil {

869

return

870

}

871

switch quote {

872

case '>':

873

z.raw.end--

874

return

875

876

case '\'', '"':

877

z.pendingAttr[1].start = z.raw.end

878

for {

879

c := z.readByte()

880

if z.err != nil {

881

z.pendingAttr[1].end = z.raw.end

882

return

883

}

884

if c == quote {

885

z.pendingAttr[1].end = z.raw.end - 1

886

return

887

}

888

}

889

890

default:

891

z.pendingAttr[1].start = z.raw.end - 1

892

for {

893

c := z.readByte()

894

if z.err != nil {

895

z.pendingAttr[1].end = z.raw.end

896

return

897

}

898

switch c {

899

case ' ', '\n', '\r', '\t', '\f':

900

z.pendingAttr[1].end = z.raw.end - 1

901

return

902

case '>':

903

z.raw.end--

904

z.pendingAttr[1].end = z.raw.end

905

return

906

}

907

}

908

}

909

}

910

911

// Next scans the next token and returns its type.

912

func (z *Tokenizer) Next() TokenType {

913

if z.err != nil {

914

z.tt = ErrorToken

915

return z.tt

916

}

917

z.raw.start = z.raw.end

918

z.data.start = z.raw.end

919

z.data.end = z.raw.end

920

if z.rawTag != "" {

921

if z.rawTag == "plaintext" {

922

// Read everything up to EOF.

923

for z.err == nil {

924

z.readByte()

925

}

926

z.data.end = z.raw.end

927

z.textIsRaw = true

928

} else {

929

z.readRawOrRCDATA()

930

}

931

if z.data.end > z.data.start {

932

z.tt = TextToken

933

z.convertNUL = true

934

return z.tt

935

}

936

}

937

z.textIsRaw = false

938

z.convertNUL = false

939

940

loop:

941

for {

942

c := z.readByte()

943

if z.err != nil {

944

break loop

945

}

946

if c != '<' {

947

continue loop

948

}

949

950

// Check if the '<' we have just read is part of a tag, comment

951

// or doctype. If not, it's part of the accumulated text token.

952

c = z.readByte()

953

if z.err != nil {

954

break loop

955

}

956

var tokenType TokenType

957

switch {

958

case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':

959

tokenType = StartTagToken

960

case c == '/':

961

tokenType = EndTagToken

962

case c == '!' || c == '?':

963

// We use CommentToken to mean any of "",

964

// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".

965

tokenType = CommentToken

966

default:

967

continue

968

}

969

970

// We have a non-text token, but we might have accumulated some text

971

// before that. If so, we return the text first, and return the non-

972

// text token on the subsequent call to Next.

973

if x := z.raw.end - len("<a"); z.raw.start < x {

974

z.raw.end = x

975

z.data.end = x

976

z.tt = TextToken

977

return z.tt

978

}

979

switch tokenType {

980

case StartTagToken:

981

z.tt = z.readStartTag()

982

return z.tt

983

case EndTagToken:

984

c = z.readByte()

985

if z.err != nil {

986

break loop

987

}

988

if c == '>' {

989

// "</>" does not generate a token at all.

990

// Reset the tokenizer state and start again.

991

z.raw.start = z.raw.end

992

z.data.start = z.raw.end

993

z.data.end = z.raw.end

994

continue loop

995

}

996

if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {

997

z.readTag(false)

998

if z.err != nil {

999

z.tt = ErrorToken

1000

} else {

1001

z.tt = EndTagToken

1002

}

1003

return z.tt

1004

}

1005

z.raw.end--

1006

z.readUntilCloseAngle()

1007

z.tt = CommentToken

1008

return z.tt

1009

case CommentToken:

1010

if c == '!' {

1011

z.tt = z.readMarkupDeclaration()

1012

return z.tt

1013

}

1014

z.raw.end--

1015

z.readUntilCloseAngle()

1016

z.tt = CommentToken

1017

return z.tt

1018

}

1019

}

1020

if z.raw.start < z.raw.end {

1021

z.data.end = z.raw.end

1022

z.tt = TextToken

1023

return z.tt

1024

}

1025

z.tt = ErrorToken

1026

return z.tt

1027

}

1028

1029

// Raw returns the unmodified text of the current token. Calling Next, Token,

1030

// Text, TagName or TagAttr may change the contents of the returned slice.

1031

func (z *Tokenizer) Raw() []byte {

1032

return z.buf[z.raw.start:z.raw.end]

1033

}

1034

1035

// convertNewlines rewrites every "\r\n" pair and every lone "\r" in s as a
// single "\n". The rewrite is done in place, so the contents of s are
// modified; because each "\r\n" shrinks by one byte, the returned slice may be
// shorter than s.
func convertNewlines(s []byte) []byte {
	// Phase 1: lone "\r" bytes can be patched where they stand. Stop at the
	// first "\r\n" pair, since from there on bytes have to be shifted left.
	i := 0
	for ; i < len(s); i++ {
		if s[i] != '\r' {
			continue
		}
		if i+1 < len(s) && s[i+1] == '\n' {
			break
		}
		s[i] = '\n'
	}
	if i == len(s) {
		return s
	}

	// Phase 2: s[i] is the "\r" of a "\r\n" pair. Drop it and compact the
	// rest, with w as the write position and r as the read position.
	w := i
	for r := i + 1; r < len(s); r++ {
		b := s[r]
		if b == '\r' {
			if r+1 < len(s) && s[r+1] == '\n' {
				r++
			}
			b = '\n'
		}
		s[w] = b
		w++
	}
	return s[:w]
}

1066

1067

// nul is the one-byte sequence that Text searches for when NUL conversion
// applies to the current token.
var nul = []byte("\x00")

// replacement is the UTF-8 encoding of U+FFFD, which Text substitutes for
// each occurrence of nul.
var replacement = []byte("\ufffd")

1071

1072

// Text returns the unescaped text of a text, comment or doctype token. The

1073

// contents of the returned slice may change on the next call to Next.

1074

func (z *Tokenizer) Text() []byte {

1075

switch z.tt {

1076

case TextToken, CommentToken, DoctypeToken:

1077

s := z.buf[z.data.start:z.data.end]

1078

z.data.start = z.raw.end

1079

z.data.end = z.raw.end

1080

s = convertNewlines(s)

1081

if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) {

1082

s = bytes.Replace(s, nul, replacement, -1)

1083

}

1084

if !z.textIsRaw {

1085

s = unescape(s, false)

1086

}

1087

return s

1088

}

1089

return nil

1090

}

1091

1092

// TagName returns the lower-cased name of a tag token (the `img` out of

1093

// `<IMG SRC="foo">`) and whether the tag has attributes.

1094

// The contents of the returned slice may change on the next call to Next.

1095

func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {

1096

if z.data.start < z.data.end {

1097

switch z.tt {

1098

case StartTagToken, EndTagToken, SelfClosingTagToken:

1099

s := z.buf[z.data.start:z.data.end]

1100

z.data.start = z.raw.end

1101

z.data.end = z.raw.end

1102

return lower(s), z.nAttrReturned < len(z.attr)

1103

}

1104

}

1105

return nil, false

1106

}

1107

1108

// TagAttr returns the lower-cased key and unescaped value of the next unparsed

1109

// attribute for the current tag token and whether there are more attributes.

1110

// The contents of the returned slices may change on the next call to Next.

1111

func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {

1112

if z.nAttrReturned < len(z.attr) {

1113

switch z.tt {

1114

case StartTagToken, SelfClosingTagToken:

1115

x := z.attr[z.nAttrReturned]

1116

z.nAttrReturned++

1117

key = z.buf[x[0].start:x[0].end]

1118

val = z.buf[x[1].start:x[1].end]

1119

return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)

1120

}

1121

}

1122

return nil, nil, false

1123

}

1124

1125

// Token returns the next Token. The result's Data and Attr values remain valid

1126

// after subsequent Next calls.

1127

func (z *Tokenizer) Token() Token {

1128

t := Token{Type: z.tt}

1129

switch z.tt {

1130

case TextToken, CommentToken, DoctypeToken:

1131

t.Data = string(z.Text())

1132

case StartTagToken, SelfClosingTagToken, EndTagToken:

1133

name, moreAttr := z.TagName()

1134

for moreAttr {

1135

var key, val []byte

1136

key, val, moreAttr = z.TagAttr()

1137

t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})

1138

}

1139

if a := atom.Lookup(name); a != 0 {

1140

t.DataAtom, t.Data = a, a.String()

1141

} else {

1142

t.DataAtom, t.Data = 0, string(name)

1143

}

1144

}

1145

return t

1146

}

1147

1148

// NewTokenizer returns a new HTML Tokenizer for the given Reader.

1149

// The input is assumed to be UTF-8 encoded.

1150

func NewTokenizer(r io.Reader) *Tokenizer {

1151

return NewTokenizerFragment(r, "")

1152

}

1153

1154

// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for

1155

// tokenizing an exisitng element's InnerHTML fragment. contextTag is that

1156

// element's tag, such as "div" or "iframe".

1157

1158

// For example, how the InnerHTML "a<b" is tokenized depends on whether it is

1159

// for a <p> tag or a <script> tag.

1160

1161

// The input is assumed to be UTF-8 encoded.

1162

func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {

1163

z := &Tokenizer{

1164

r: r,

1165

buf: make([]byte, 0, 4096),

1166

}

1167

if contextTag != "" {

1168

switch s := strings.ToLower(contextTag); s {

1169

case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":

1170

z.rawTag = s

1171

}

1172

}

1173

return z

1174

}

Older »