1
// Copyright 2010 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
13
// These replacements permit compatibility with old numeric entities that
14
// assumed Windows-1252 encoding.
15
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
16
var replacementTable = [...]rune{
17
'\u20AC', // First entry is what 0x80 should be replaced with.
48
'\u0178', // Last entry is 0x9F.
49
// 0x00->'\uFFFD' is handled programmatically.
50
// 0x0D->'\u000D' is a no-op.
53
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
54
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55
// Precondition: b[src] == '&' && dst <= src.
56
// attribute should be true if parsing an attribute value.
57
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
58
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
60
// i starts at 1 because we already know that s[0] == '&'.
65
return dst + 1, src + 1
69
if len(s) <= 3 { // We need to have at least "&#.".
71
return dst + 1, src + 1
76
if c == 'x' || c == 'X' {
86
if '0' <= c && c <= '9' {
87
x = 16*x + rune(c) - '0'
89
} else if 'a' <= c && c <= 'f' {
90
x = 16*x + rune(c) - 'a' + 10
92
} else if 'A' <= c && c <= 'F' {
93
x = 16*x + rune(c) - 'A' + 10
96
} else if '0' <= c && c <= '9' {
97
x = 10*x + rune(c) - '0'
106
if i <= 3 { // No characters matched.
108
return dst + 1, src + 1
111
if 0x80 <= x && x <= 0x9F {
112
// Replace characters from Windows-1252 with UTF-8 equivalents.
113
x = replacementTable[x-0x80]
114
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
115
// Replace invalid characters with the replacement character.
119
return dst + utf8.EncodeRune(b[dst:], x), src + i
122
// Consume the maximum number of characters possible, with the
123
// consumed characters matching one of the named references.
128
// Lower-cased characters are more common in entities, so we check for them first.
129
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
138
entityName := string(s[1:i])
139
if entityName == "" {
141
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
143
} else if x := entity[entityName]; x != 0 {
144
return dst + utf8.EncodeRune(b[dst:], x), src + i
145
} else if x := entity2[entityName]; x[0] != 0 {
146
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
147
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
148
} else if !attribute {
149
maxLen := len(entityName) - 1
150
if maxLen > longestEntityWithoutSemicolon {
151
maxLen = longestEntityWithoutSemicolon
153
for j := maxLen; j > 1; j-- {
154
if x := entity[entityName[:j]]; x != 0 {
155
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
160
dst1, src1 = dst+i, src+i
161
copy(b[dst:dst1], b[src:src1])
165
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
166
// attribute should be true if parsing an attribute value.
167
func unescape(b []byte, attribute bool) []byte {
168
for i, c := range b {
170
dst, src := unescapeEntity(b, i, i, attribute)
174
dst, src = unescapeEntity(b, dst, src, attribute)
177
dst, src = dst+1, src+1
186
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
187
func lower(b []byte) []byte {
188
for i, c := range b {
189
if 'A' <= c && c <= 'Z' {
196
const escapedChars = "&'<>\"\r"
198
func escape(w writer, s string) error {
199
i := strings.IndexAny(s, escapedChars)
201
if _, err := w.WriteString(s[:i]); err != nil {
209
// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
216
// "&#34;" is shorter than "&quot;".
221
panic("unrecognized escape character")
224
if _, err := w.WriteString(esc); err != nil {
227
i = strings.IndexAny(s, escapedChars)
229
_, err := w.WriteString(s)
233
// EscapeString escapes special characters like "<" to become "&lt;". It
234
// escapes only five such characters: <, >, &, ' and ".
235
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't always true.
237
func EscapeString(s string) string {
238
if strings.IndexAny(s, escapedChars) == -1 {
246
// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
247
// larger range of entities than EscapeString escapes. For example, "&aacute;"
248
// unescapes to "á", as does "&#225;" and "&#xE1;".
249
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't always true.
251
func UnescapeString(s string) string {
252
for _, c := range s {
254
return string(unescape([]byte(s), false))