-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtokenize.go
98 lines (86 loc) · 1.85 KB
/
tokenize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// Copyright 2013 The gopp AUTHORS. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gopp
import (
"fmt"
"regexp"
)
// Token is one lexical element produced by Tokenize.
type Token struct {
	// Type is the Type of the token expression that matched.
	Type string
	// Raw is the complete text matched by that expression.
	Raw string
	// Text is the text of the expression's first capture group.
	Text string
	// Row is the zero-based row counter Tokenize held when the token started.
	Row int
	// Col is the zero-based column counter Tokenize held when the token
	// started; Tokenize resets it to 0 after each '\n' byte.
	Col int
}
// String formats the token as its Type followed by its quoted Text, wrapped
// in parentheses, e.g. (ident: "foo"). Raw and the position are not included.
func (t Token) String() string {
	const layout = "(%s: %q)"
	return fmt.Sprintf(layout, t.Type, t.Text)
}
// TokenizeInfo holds the regular expressions that drive Tokenize.
type TokenizeInfo struct {
	// TokenREs are the token expressions. Tokenize tries them in slice
	// order and emits a Token for the first one that matches.
	TokenREs []TypedRegexp

	// IgnoreREs match text that Tokenize discards without emitting a
	// Token. They are tried, in slice order, before TokenREs.
	IgnoreREs []*regexp.Regexp
}
// Tokenize splits document into Tokens using the expressions in ti.
//
// At each position the ti.IgnoreREs are tried first, in order. Text matched
// by one of them is dropped and scanning restarts at the following byte.
// Otherwise the ti.TokenREs are tried in order and the first one that matches
// yields a Token whose Type is the expression's Type, whose Raw is the whole
// match, and whose Text is the first capture group (left empty when the
// expression has no capture group). Zero-length matches are treated as "no
// match" because they would consume nothing and scanning could never advance.
//
// Row and Col on each Token are zero-based and describe where the token
// starts in the original document. Columns are counted in bytes and reset
// after every '\n'. Ignored text advances the position as well.
//
// An error is returned when an ignore expression matches somewhere other
// than the start of the remaining input, or when no expression matches at
// all. In both cases the tokens collected so far are returned with the error.
func Tokenize(ti TokenizeInfo, document []byte) (tokens []Token, err error) {
	var row, col int

	// advance moves the position counters past text that has been consumed.
	advance := func(consumed []byte) {
		for _, c := range consumed {
			if c == '\n' {
				row++
				col = 0
				continue
			}
			col++
		}
	}

tokenloop:
	for len(document) != 0 {
		// If there is something to ignore at this position, trim it off.
		for _, re := range ti.IgnoreREs {
			matches := re.FindSubmatch(document)
			if len(matches) == 0 || len(matches[0]) == 0 {
				continue
			}
			ignored := matches[0]
			// Ignore expressions are expected to match at the start of the
			// remaining input; a differing prefix means the match was found
			// further into the document.
			if string(document[:len(ignored)]) != string(ignored) {
				return tokens, fmt.Errorf("Regexp matched text not at beginning: %s", re)
			}
			advance(ignored)
			document = document[len(ignored):]
			continue tokenloop
		}

		for _, re := range ti.TokenREs {
			matches := re.FindSubmatch(document)
			if len(matches) == 0 || len(matches[0]) == 0 {
				continue
			}
			matched := matches[0]
			token := Token{
				Type: re.Type,
				Raw:  string(matched),
				Row:  row,
				Col:  col,
			}
			// matches[1] only exists when the expression has a capture group.
			if len(matches) > 1 {
				token.Text = string(matches[1])
			}
			advance(matched)
			document = document[len(matched):]
			tokens = append(tokens, token)
			continue tokenloop
		}

		// Nothing matched: report a bounded excerpt of the offending input.
		snippet := document
		if len(snippet) > 80 {
			snippet = snippet[:80]
		}
		return tokens, fmt.Errorf("Could not match starting from %q.", snippet)
	}
	return tokens, nil
}