Skip to content

Commit ee01d0c

Browse files
committed
feat: port html go internal package
1 parent 2e56ecf commit ee01d0c

File tree

7 files changed

+2727
-1
lines changed

7 files changed

+2727
-1
lines changed

docs/reference/go-gno-compatibility.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -184,7 +184,7 @@ Legend:
184184
| hash/crc64 | `todo` |
185185
| hash/fnv | `todo` |
186186
| hash/maphash | `todo` |
187-
| html | `todo` |
187+
| html | `full` |
188188
| html/template | `todo` |
189189
| image | `tbd` |
190190
| image/color | `tbd` |

gnovm/stdlibs/generated.go

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gnovm/stdlibs/html/entity.gno

+2,263
Large diffs are not rendered by default.

gnovm/stdlibs/html/entity_test.gno

+37
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,37 @@
1+
// Copyright 2010 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package html
6+
7+
import (
8+
"testing"
9+
"unicode/utf8"
10+
)
11+
12+
func init() {
13+
UnescapeString("") // force load of entity maps
14+
}
15+
16+
func TestEntityLength(t *testing.T) {
17+
if len(entity) == 0 || len(entity2) == 0 {
18+
t.Fatal("maps not loaded")
19+
}
20+
21+
// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
22+
// The +1 comes from the leading "&". This property implies that the length of
23+
// unescaped text is <= the length of escaped text.
24+
for k, v := range entity {
25+
if 1+len(k) < utf8.RuneLen(v) {
26+
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
27+
}
28+
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
29+
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
30+
}
31+
}
32+
for k, v := range entity2 {
33+
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
34+
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
35+
}
36+
}
37+
}

gnovm/stdlibs/html/escape.gno

+217
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,217 @@
1+
// Copyright 2010 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Package html provides functions for escaping and unescaping HTML text.
6+
package html
7+
8+
import (
9+
"strings"
10+
"unicode/utf8"
11+
)
12+
13+
// replacementTable maps the numeric character references 0x80 through 0x9F
// to the code points they stand for, for compatibility with old numeric
// entities that assumed the Windows-1252 encoding. Index it with (value - 0x80).
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
//
// Two other spec replacements are not in the table: 0x00 -> U+FFFD is handled
// programmatically, and 0x0D -> U+000D is a no-op.
var replacementTable = [...]rune{
	0x20AC, // 0x80
	0x0081, // 0x81
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D
	0x017D, // 0x8E
	0x008F, // 0x8F
	0x0090, // 0x90
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D
	0x017E, // 0x9E
	0x0178, // 0x9F
}
52+
53+
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
54+
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55+
// Precondition: b[src] == '&' && dst <= src.
56+
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
57+
const attribute = false
58+
59+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
60+
61+
// i starts at 1 because we already know that s[0] == '&'.
62+
i, s := 1, b[src:]
63+
64+
if len(s) <= 1 {
65+
b[dst] = b[src]
66+
return dst + 1, src + 1
67+
}
68+
69+
if s[i] == '#' {
70+
if len(s) <= 3 { // We need to have at least "&#.".
71+
b[dst] = b[src]
72+
return dst + 1, src + 1
73+
}
74+
i++
75+
c := s[i]
76+
hex := false
77+
if c == 'x' || c == 'X' {
78+
hex = true
79+
i++
80+
}
81+
82+
x := '\x00'
83+
for i < len(s) {
84+
c = s[i]
85+
i++
86+
if hex {
87+
if '0' <= c && c <= '9' {
88+
x = 16*x + rune(c) - '0'
89+
continue
90+
} else if 'a' <= c && c <= 'f' {
91+
x = 16*x + rune(c) - 'a' + 10
92+
continue
93+
} else if 'A' <= c && c <= 'F' {
94+
x = 16*x + rune(c) - 'A' + 10
95+
continue
96+
}
97+
} else if '0' <= c && c <= '9' {
98+
x = 10*x + rune(c) - '0'
99+
continue
100+
}
101+
if c != ';' {
102+
i--
103+
}
104+
break
105+
}
106+
107+
if i <= 3 { // No characters matched.
108+
b[dst] = b[src]
109+
return dst + 1, src + 1
110+
}
111+
112+
if 0x80 <= x && x <= 0x9F {
113+
// Replace characters from Windows-1252 with UTF-8 equivalents.
114+
x = replacementTable[x-0x80]
115+
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
116+
// Replace invalid characters with the replacement character.
117+
x = '\uFFFD'
118+
}
119+
120+
return dst + utf8.EncodeRune(b[dst:], x), src + i
121+
}
122+
123+
// Consume the maximum number of characters possible, with the
124+
// consumed characters matching one of the named references.
125+
126+
for i < len(s) {
127+
c := s[i]
128+
i++
129+
// Lower-cased characters are more common in entities, so we check for them first.
130+
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
131+
continue
132+
}
133+
if c != ';' {
134+
i--
135+
}
136+
break
137+
}
138+
139+
entityName := s[1:i]
140+
if len(entityName) == 0 {
141+
// No-op.
142+
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
143+
// No-op.
144+
} else if x := entity[string(entityName)]; x != 0 {
145+
return dst + utf8.EncodeRune(b[dst:], x), src + i
146+
} else if x := entity2[string(entityName)]; x[0] != 0 {
147+
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
148+
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
149+
} else if !attribute {
150+
maxLen := len(entityName) - 1
151+
if maxLen > longestEntityWithoutSemicolon {
152+
maxLen = longestEntityWithoutSemicolon
153+
}
154+
for j := maxLen; j > 1; j-- {
155+
if x := entity[string(entityName[:j])]; x != 0 {
156+
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
157+
}
158+
}
159+
}
160+
161+
dst1, src1 = dst+i, src+i
162+
copy(b[dst:dst1], b[src:src1])
163+
return dst1, src1
164+
}
165+
166+
// htmlEscaper holds the five replacements performed by EscapeString. The
// numeric forms are used for the quote characters because "&#39;" is shorter
// than "&apos;" (which was also not in HTML until HTML5) and "&#34;" is
// shorter than "&quot;".
var htmlEscaper = strings.NewReplacer(
	"&", "&amp;",
	"'", "&#39;",
	"<", "&lt;",
	">", "&gt;",
	"\"", "&#34;",
)

// EscapeString returns s with the five characters <, >, &, ' and " replaced
// by their HTML entities, so that for example "<" becomes "&lt;". All other
// characters are left as they are.
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
	escaped := htmlEscaper.Replace(s)
	return escaped
}
181+
182+
// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
183+
// larger range of entities than EscapeString escapes. For example, "&aacute;"
184+
// unescapes to "á", as does "&#225;" and "&#xE1;".
185+
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
186+
// always true.
187+
func UnescapeString(s string) string {
188+
if !populateMapsOnce {
189+
populateMaps()
190+
populateMapsOnce = true
191+
}
192+
i := strings.IndexByte(s, '&')
193+
194+
if i < 0 {
195+
return s
196+
}
197+
198+
b := []byte(s)
199+
dst, src := unescapeEntity(b, i, i)
200+
for len(s[src:]) > 0 {
201+
if s[src] == '&' {
202+
i = 0
203+
} else {
204+
i = strings.IndexByte(s[src:], '&')
205+
}
206+
if i < 0 {
207+
dst += copy(b[dst:], s[src:])
208+
break
209+
}
210+
211+
if i > 0 {
212+
copy(b[dst:], s[src:src+i])
213+
}
214+
dst, src = unescapeEntity(b, dst+i, src+i)
215+
}
216+
return string(b[:dst])
217+
}

0 commit comments

Comments
 (0)