scan.go
package nestext
import (
"fmt"
"io"
"strings"
)
// TODO: set a new ScanLines function which will break on 'CR' without a following 'LF' (see spec)
// We will be using two different scanners:
//
// - a line-level scanner, concerned with recognizing lines of input as tokens.
//   These are to be consumed by a parser for the overall NestedText grammar.
// - an inline-scanner, concerned with recognizing words from inline-items' content.
//   These will be used to parse inline-items like "{ key1:value1, key2:value2 }".
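//
// For illustration only (this example is not part of the package), a NestedText
// input mixing line-level items with an inline item could look like this:
//
//	name: Alice
//	address:
//	    > 123 Main Street
//	    > Anytown
//	phones: {home: 555-1234, mobile: 555-5678}
//
// The line-level scanner turns each physical line into one token; the inline scanner
// is only concerned with the content of the "{…}" value in the last line.
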
// --- Line level scanner ----------------------------------------------------
// scanner is a type for a line-level scanner.
//
// Our line-level scanner will operate by calling scanning steps in a chain, iteratively.
// Each step function tests for valid lookahead and then possibly branches out to a
// subsequent step function. Step functions may consume input characters ("match(…)").
//
type scanner struct {
Buf *lineBuffer // line buffer abstracts away properties of input readers
Step scannerStep // the next scanner step to execute in a chain
LastError error // last error, if any
}
// We're building up a scanner from chains of scanner step functions.
// Tokens may be modified by a step function.
// A scanner step will return the next step in the chain, or nil to stop/accept.
//
type scannerStep func(*parserToken) (*parserToken, scannerStep)
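
// To illustrate the pattern (this is a hypothetical step, not one of the steps below):
// a step function inspects the lookahead, possibly consumes input, and either chains
// on to a next step or accepts by returning a nil step:
//
//	func (sc *scanner) scanHash(token *parserToken) (*parserToken, scannerStep) {
//		if sc.Buf.Lookahead == '#' {        // test the lookahead …
//			sc.Buf.match(singleRune('#'))   // … consume the matched character …
//			return token, sc.scanHash       // … and continue with the next step.
//		}
//		return token, nil                   // nil step: stop/accept.
//	}
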
// newScanner creates a scanner for an input reader.
func newScanner(inputReader io.Reader) (*scanner, error) {
if inputReader == nil {
return nil, makeParsingError(nil, ErrCodeFormatNoInput, "no input present")
}
buf := newLineBuffer(inputReader)
sc := &scanner{Buf: buf}
sc.Step = sc.ScanFileStart
return sc, nil
}
// NextToken will be called by the parser to receive the next line-level token. A token
// subsumes the properties of a line of NestedText input (excluding inline-items such
// as "{ key:val, key:val }" ).
//
// NextToken ususally will iterate over a chain of step functions until it reaches an
// accepting state. Acceptance is signalled by getting a nil-step return value from a
// step function, meaning there is no further step applicable in this chain.
//
// If a step function returns an error-signalling token, the chaining stops as well.
//
func (sc *scanner) NextToken() *parserToken {
token := newParserToken(sc.Buf.CurrentLine, int(sc.Buf.Cursor))
if sc.Buf.IsEof() {
token.TokenType = eof
return token
}
if sc.Step == nil {
sc.Step = sc.ScanItem
}
for sc.Step != nil {
token, sc.Step = sc.Step(token)
if token.Error != nil {
sc.LastError = token.Error
sc.Buf.AdvanceLine()
break
}
if sc.Buf.Line.Size() == 0 {
//fmt.Printf("===> line empty\n")
break
}
}
//fmt.Printf("# new %s\n", token)
return token
}
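
// A typical way of driving the scanner (an illustrative sketch only; the actual
// parser of this package may differ) is a loop that pulls tokens until eof or an
// error occurs:
//
//	sc, err := newScanner(strings.NewReader(input))
//	if err != nil {
//		return err
//	}
//	for {
//		token := sc.NextToken()
//		if token.Error != nil || token.TokenType == eof {
//			break
//		}
//		// … hand the token over to the NestedText parser …
//	}
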
// ScanFileStart matches a valid start of a NestedText document input. This is always the
// first step function to call.
//
// file start:
// -> EOF: emptyDocument
// -> other: docRoot
//
func (sc *scanner) ScanFileStart(token *parserToken) (*parserToken, scannerStep) {
token.TokenType = emptyDocument
if sc.Buf == nil {
token.Error = makeParsingError(token, ErrCodeFormatNoInput, "no valid input document")
return token, nil
}
if sc.Buf.IsEof() {
return token, nil
}
token.TokenType = docRoot
token.Indent = 0
if sc.Buf.Lookahead == ' ' {
// From the spec: There is no indentation on the top-level object.
token.Error = makeParsingError(token, ErrCodeFormatToplevelIndent, "top-level item must not be indented")
}
return token, nil
}
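
// As an example of the check above: if the input starts with a space character (say,
// the first line reads "  name: Alice"), ScanFileStart flags ErrCodeFormatToplevelIndent,
// because there must be no indentation on the top-level item.
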
// ScanItem is a step function to start recognizing a line-level item.
func (sc *scanner) ScanItem(token *parserToken) (*parserToken, scannerStep) {
//fmt.Println("---> ScanItem")
if sc.Buf.Lookahead == ' ' {
return token, sc.ScanIndentation
}
return token, sc.ScanItemBody
}
// ScanIndentation is a step function to recognize the indentation part of an item.
func (sc *scanner) ScanIndentation(token *parserToken) (*parserToken, scannerStep) {
if sc.Buf.Lookahead == ' ' {
sc.Buf.match(singleRune(' '))
token.Indent++
return token, sc.ScanIndentation
}
return token, sc.ScanItemBody
}
// ScanItemBody is a step function to recognize the main part of an item, starting at
// the item's tag (e.g., ':', '>', etc.). The only exceptions are inline keys and
// inline key-value pairs, which start with the key's string.
//
func (sc *scanner) ScanItemBody(token *parserToken) (*parserToken, scannerStep) {
//fmt.Printf("---> ScanItemBody, LA = '%#U'\n", sc.Buf.Lookahead)
switch sc.Buf.Lookahead {
case '-': // list value, either single-line or multi-line. From the spec:
// If the first non-space character on a line is a dash followed immediately by a space (-␣) or
// a line break, the line is a list item.
sc.Buf.match(singleRune('-'))
switch sc.Buf.Lookahead {
case ' ', '\n': // yes, this is a valid list tag
return sc.recognizeItemTag('-', listItem, listItemMultiline, token), nil
default: // rare case: '-' as start of a dict key
return token, sc.ScanInlineKey
}
case '>': // multi-line string. From the spec:
// If the first non-space character on a line is a greater-than symbol followed immediately by
// a space (>␣) or a line break, the line is a string item.
sc.Buf.match(singleRune('>'))
switch sc.Buf.Lookahead {
case ' ', '\n': // yes, this is a valid string tag
return sc.recognizeItemTag('>', stringMultiline, stringMultiline, token), nil
default: // rare case: '>' as start of a dict key
return token, sc.ScanInlineKey
}
case ':': // multi-line key. From the spec:
// If the first non-space character on a line is a colon followed immediately by a space (:␣) or
// a line break, the line is a key item.
sc.Buf.match(singleRune(':'))
switch sc.Buf.Lookahead {
case ' ', '\n': // yes, this is a valid dict-key tag
return sc.recognizeItemTag(':', dictKeyMultiline, dictKeyMultiline, token), nil
default: // rare case: ':' as start of a dict-key
return token, sc.ScanInlineKey
}
case '[': // single-line list
return sc.recognizeInlineItem(inlineList, token), nil
case '{': // single-line dictionary
return sc.recognizeInlineItem(inlineDict, token), nil
default: // should be dictionary key
}
return token, sc.ScanInlineKey // 'epsilon-transition' to inline-key-value rules
}
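
// The branches above map example lines (illustrative only) to token types as follows:
//
//	- red apples        →  listItem            (tag followed by a space)
//	-                   →  listItemMultiline   (tag followed by a line break)
//	> a line of text    →  stringMultiline
//	: a multi-line key  →  dictKeyMultiline
//	[red, green]        →  inlineList
//	{x: 1, y: 2}        →  inlineDict
//	name: Alice         →  handed on to ScanInlineKey
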
// ScanInlineKey is a step function to recognize an inline key, optionally followed by an inline
// value.
func (sc *scanner) ScanInlineKey(token *parserToken) (*parserToken, scannerStep) {
switch sc.Buf.Lookahead { // consume characters; stop on ': ', ':\n' or EOL
case ':':
//fmt.Printf("@ LA = %#U, line = %q, at %d\n", sc.Buf.Lookahead, sc.Buf.Text, sc.Buf.Cursor)
sc.Buf.match(singleRune(':'))
//fmt.Printf("LA = %#U, line = %q, at %d\n", sc.Buf.Lookahead, sc.Buf.Text, sc.Buf.Cursor)
switch sc.Buf.Lookahead {
case ' ': // yes, this is a valid dict-key tag
//fmt.Printf("LA = %#U, line = %q, at %d\n", sc.Buf.Lookahead, sc.Buf.Text, sc.Buf.Cursor)
//fmt.Println("should fork --->")
// remove trailing whitespace from key (=> Content[0])
key := sc.Buf.Text[token.Indent : sc.Buf.ByteCursor-2]
token.Content = append(token.Content, strings.TrimSpace(key))
token = sc.recognizeItemTag(':', inlineDictKeyValue, inlineDictKey, token)
case eolMarker: // yes, this is a valid dict-key tag
// remove trailing whitespace from key (=> Content[0])
key := sc.Buf.Text[token.Indent : sc.Buf.ByteCursor-1]
token.Content = append(token.Content, strings.TrimSpace(key))
token = sc.recognizeItemTag(':', inlineDictKeyValue, inlineDictKey, token)
default: // rare case: ':' inside a dict key
//sc.Buf.match(anything())
//panic(fmt.Sprintf(":: found: LA=%#U\n", sc.Buf.Lookahead))
return token, sc.ScanInlineKey
}
case eolMarker: // Error: premature end of line
key := sc.Buf.Text[token.Indent : sc.Buf.ByteCursor-1]
token.Error = makeParsingError(token, ErrCodeFormatIllegalTag,
fmt.Sprintf("dict key item %q not properly terminated by ':'", key))
//fmt.Printf("LA = %#U, line = %q, at %d\n", sc.Buf.Lookahead, sc.Buf.Text, sc.Buf.Cursor)
default: // recognize everything as either part of the key or trailing whitespace
sc.Buf.match(anything())
return token, sc.ScanInlineKey
}
return token, nil
}
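
// Illustrative examples for ScanInlineKey: a line "name: Alice" produces an
// inlineDictKeyValue token with Content[0] == "name" and the remainder of the line
// appended as the value, while a line ending in "name:" produces an inlineDictKey
// token, i.e. a key without a value on the same line.
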
// recognizeItemTag continues after a valid item tag has been discovered. It will
// match the second character of the tag (either a space or a newline) and,
// depending on this character, select the continuation call.
//
func (sc *scanner) recognizeItemTag(tag rune, single, multi parserTokenType, token *parserToken) *parserToken {
//fmt.Printf("forked: LA = %#U, line = %q, at %d\n", sc.Buf.Lookahead, sc.Buf.Text, sc.Buf.Cursor)
	// sc.Buf.match(singleRune(tag)) // changed: tag is already matched by the calling party
if sc.Buf.Lookahead != ' ' && sc.Buf.Lookahead != eolMarker {
token.Error = makeParsingError(token, ErrCodeFormatIllegalTag,
fmt.Sprintf("item tag %q followed by illegal character %#U", tag, sc.Buf.Lookahead))
return token
}
if sc.Buf.Lookahead == ' ' {
sc.Buf.match(singleRune(' '))
token.TokenType = single
token.Content = append(token.Content, sc.Buf.ReadLineRemainder())
return token
}
sc.Buf.match(singleRune(eolMarker))
token.TokenType = multi
return token
}
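
// recognizeInlineItem handles a single-line inline item (inline list or inline dict).
// It checks that the last non-space character of the line is the bracket matching the
// opening bracket in the lookahead, then stores the remainder of the line as the
// token's content, to be taken apart later by the inline-item scanner.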
func (sc *scanner) recognizeInlineItem(toktype parserTokenType, token *parserToken) *parserToken {
trimmed := strings.TrimSpace(sc.Buf.Text)
closing := trimmed[len(trimmed)-1]
//closing := sc.Buf.Text[len(sc.Buf.Text)-1]
if !isMatchingBracket(sc.Buf.Lookahead, rune(closing)) {
token.Error = makeParsingError(token, ErrCodeFormatIllegalTag,
fmt.Sprintf("inline-item does not match opening tag: %#U vs %#U",
sc.Buf.Lookahead, rune(closing)))
}
token.TokenType = toktype
token.Content = append(token.Content, sc.Buf.ReadLineRemainder())
return token
}
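
// isMatchingBracket reports whether close is the closing bracket corresponding to
// open, i.e. '[' pairs with ']' and '{' pairs with '}'.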
func isMatchingBracket(open, close rune) bool {
if open == '[' {
return close == ']'
}
if open == '{' {
return close == '}'
}
return false
}