Skip to content

Commit 6673c09

Browse files
committed
lexer changes
1 parent f841375 commit 6673c09

File tree

8 files changed

+336
-11
lines changed

8 files changed

+336
-11
lines changed

experimental/parser/lex.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,9 @@ func Lex(ctx token.Context, errs *report.Report) {
9696
} else {
9797
text = l.SeekEOF()
9898
}
99+
99100
l.Push(len("//")+len(text), token.Comment)
101+
100102
case r == '/' && l.Peek() == '*':
101103
l.cursor++ // Skip the *.
102104

@@ -116,7 +118,9 @@ func Lex(ctx token.Context, errs *report.Report) {
116118
l.Error(ErrUnmatched{Span: l.SpanFrom(l.cursor - 2)})
117119
text = l.SeekEOF()
118120
}
121+
119122
l.Push(len("/*")+len(text), token.Comment)
123+
120124
case r == '*' && l.Peek() == '/':
121125
l.cursor++ // Skip the /.
122126

experimental/parser/lex_state.go

Lines changed: 195 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
package parser
1616

1717
import (
18+
"fmt"
19+
"slices"
1820
"strings"
1921
"unicode/utf8"
2022

@@ -31,11 +33,12 @@ type lexer struct {
3133
cursor, count int
3234

3335
braces []token.ID
34-
}
3536

36-
func (l *lexer) Push(length int, kind token.Kind) token.Token {
37-
l.count++
38-
return l.Stream.Push(length, kind)
37+
prev token.ID // The last non-skippable token.
38+
39+
firstCommentSincePrev token.ID
40+
firstCommentOnSameLine bool
41+
parStart, parEnd token.ID
3942
}
4043

4144
func (l *lexer) Cursor() int {
@@ -107,6 +110,194 @@ func (l *lexer) SpanFrom(start int) report.Span {
107110
return l.Span(start, l.cursor)
108111
}
109112

113+
func (l *lexer) Push(length int, kind token.Kind) token.Token {
114+
l.count++
115+
prev := l.prev.In(l.Context)
116+
tok := l.Stream.Push(length, kind)
117+
// NOTE: tok will have the Stream rather than l.Context as its context,
118+
// which will cause issues when we call NewCursor below.
119+
tok = tok.ID().In(l.Context)
120+
121+
// NOTE: For the purposes of attributing comments, we need to know what line
122+
// certain offsets are at. Although we could track this as we advance cursor,
123+
// we instead use other methods to determine if two tokens are on the same
124+
// line. This is for a couple of reasons.
125+
//
126+
// 1. Getting a line number from the line index is O(log n), but we can
127+
// instead use strings.Index and friends in some places without going
128+
// quadratic.
129+
//
130+
// 2. Having to examine every character directly locks us out of using e.g.
131+
// strings.Index for certain operations, which is much more efficient
132+
// than the naive for loop.
133+
134+
switch {
135+
case tok.Kind() == token.Comment:
136+
isLineComment := strings.HasPrefix(tok.Text(), "//")
137+
138+
if l.firstCommentSincePrev.Nil() {
139+
l.firstCommentSincePrev = tok.ID()
140+
141+
if !prev.Nil() && l.newLinesBetween(prev, tok, 1) == 0 {
142+
// The first comment is always in a paragraph by itself if there
143+
// is no newline between it and the comment start.
144+
l.firstCommentOnSameLine = true
145+
break
146+
}
147+
}
148+
149+
if !isLineComment {
150+
// Block comments cannot be made into paragraphs, so we must
151+
// interrupt the current paragraph.
152+
l.fuseParagraph()
153+
break
154+
}
155+
156+
// Start building up a line comment paragraph if there isn't one
157+
// currently.
158+
if l.parStart.Nil() {
159+
l.parStart = tok.ID()
160+
}
161+
l.parEnd = tok.ID()
162+
163+
case tok.Kind() == token.Space:
164+
// Note that line comments contain their newlines, except for a line
165+
// comment at the end of the file. Thus, seeing a single new line
166+
// means that if we are interrupting a line comment paragraph, and thus
167+
// we must fuse the current paragraph.
168+
if strings.Contains(tok.Text(), "\n") {
169+
l.fuseParagraph()
170+
}
171+
172+
default:
173+
l.fuseParagraph()
174+
//nolint:dupword // False positive due to comments describing an algorithm.
175+
if !l.firstCommentSincePrev.Nil() {
176+
fmt.Println(l.firstCommentSincePrev.In(l.Context), tok)
177+
comments := token.NewCursor(l.firstCommentSincePrev.In(l.Context), tok)
178+
var first, second, penultimate, last token.Token
179+
for { // Don't use l.Done() here, that tosses comment tokens.
180+
next := comments.PopSkippable()
181+
if next.Nil() {
182+
break
183+
} else if next.Kind() == token.Comment {
184+
switch {
185+
case first.Nil():
186+
first = next
187+
case second.Nil():
188+
second = next
189+
}
190+
penultimate = last
191+
last = next
192+
}
193+
}
194+
fmt.Println(first, second, penultimate, last)
195+
196+
// Determine if we need to donate first to the previous comment.
197+
var donate bool
198+
switch {
199+
case prev.Nil():
200+
donate = false
201+
case l.firstCommentOnSameLine:
202+
donate = true
203+
case l.newLinesBetween(prev, first, 2) < 2:
204+
// Now we need to check the remaining three criteria for
205+
// donate. These are:
206+
//
207+
// 1. Is there more than one comment.
208+
// 2. Is the token one of the closers ), ], or } (but not
209+
// >).
210+
// 3. The line of the current token minus the end line of
211+
// the first comment is greater than one.
212+
switch {
213+
case !second.Nil():
214+
donate = true
215+
case slices.Contains([]string{")", "]", "}"}, tok.Text()):
216+
donate = true
217+
case l.newLinesBetween(first, tok, 2) > 1:
218+
donate = true
219+
}
220+
}
221+
222+
if donate {
223+
prev.Comments().SetTrailing(first)
224+
first = second
225+
}
226+
227+
// The leading comment must have precisely one newline between
228+
// it and the new token.
229+
if !first.Nil() && !last.Nil() && l.newLinesBetween(last, tok, 2) == 1 {
230+
tok.Comments().SetLeading(last)
231+
last = penultimate
232+
}
233+
234+
// Check if we have any detached comments left. This is the case
235+
// when first and last are both non-nil and <=. If we donated the
236+
// only comment, second will have been nil, so first is now nil.
237+
//
238+
// If we attached the only remaining comment after donating a
239+
// comment, we would have had the following value evolution for
240+
// first, second, penultimate and last:
241+
//
242+
// before donate: a, b, a, b
243+
// after donate: b, b, a, b
244+
// after attach: b, b, a, a
245+
//
246+
// Thus, when we check b < a, we find that we have nothing left to
247+
// attach.
248+
if !first.Nil() && !last.Nil() && first.ID() <= last.ID() {
249+
tok.Comments().SetDetachedRange(first, last)
250+
}
251+
252+
l.firstCommentSincePrev = 0
253+
l.firstCommentOnSameLine = false
254+
}
255+
256+
l.prev = tok.ID()
257+
}
258+
return tok
259+
}
260+
261+
func (l *lexer) fuseParagraph() {
262+
if !l.parStart.Nil() && l.parEnd != l.parStart {
263+
token.Fuse(
264+
l.parStart.In(l.Context),
265+
l.parEnd.In(l.Context),
266+
)
267+
}
268+
l.parStart = 0
269+
}
270+
271+
// newLinesBetween counts the number of \n characters between the end of a
272+
// and the start of b, up to max.
273+
//
274+
// The final rune of a is included in this count, since comments may end in a
275+
// \n rune.
276+
//
277+
//nolint:revive,predeclared // Complains about redefining max.
278+
func (l *lexer) newLinesBetween(a, b token.Token, max int) int {
279+
end := a.Span().End
280+
if end != 0 {
281+
// Account for the final rune of a.
282+
end--
283+
}
284+
285+
start := b.Span().Start
286+
between := l.Text()[end:start]
287+
288+
var total int
289+
for total < max {
290+
var found bool
291+
_, between, found = strings.Cut(between, "\n")
292+
if !found {
293+
break
294+
}
295+
296+
total++
297+
}
298+
return total
299+
}
300+
110301
// mustProgress returns a progress checker for this lexer.
111302
func (l *lexer) mustProgress() mustProgress {
112303
return mustProgress{l, -1}

experimental/parser/lex_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/bufbuild/protocompile/experimental/report"
2727
"github.com/bufbuild/protocompile/experimental/token"
2828
"github.com/bufbuild/protocompile/internal/golden"
29+
"github.com/bufbuild/protocompile/internal/iters"
2930
)
3031

3132
func TestRender(t *testing.T) {
@@ -87,6 +88,22 @@ func TestRender(t *testing.T) {
8788
}
8889
}
8990

91+
comments := tok.Comments()
92+
iters.Enumerate(comments.Detached())(func(i int, t token.Token) bool {
93+
if i == 0 {
94+
fmt.Fprintf(&tsv, "\t\tdetached:%v", t.ID())
95+
} else {
96+
fmt.Fprintf(&tsv, ",%v", t.ID())
97+
}
98+
return true
99+
})
100+
if leading := comments.Leading(); !leading.Nil() {
101+
fmt.Fprintf(&tsv, "\t\tleading:%v", leading.ID())
102+
}
103+
if trailing := comments.Trailing(); !trailing.Nil() {
104+
fmt.Fprintf(&tsv, "\t\ttrailing:%v", trailing.ID())
105+
}
106+
90107
tsv.WriteByte('\n')
91108
return true
92109
})
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// This, as expected, is a leading comment for Foo.
2+
message Foo {
3+
// This is the TRAILING comment for Foo. (It is NOT
4+
// a detached comment for baz.)
5+
6+
// leading comment for baz
7+
string baz = 1;
8+
// trailing comment for baz
9+
}
10+
// This is NOT a trailing comment. It's also not considered
11+
// a detached comment for Bar. It is discarded.
12+
13+
// This IS a detached comment for Bar.
14+
15+
// A leading comment for Bar.
16+
message Bar {
17+
}
18+
19+
string name = 1; // trailing comment for name
20+
// leading comment for id
21+
uint64 id = 2;
22+
23+
previousToken // this comment
24+
// won't get merged into a
25+
// group with these two lines
26+
/* block comments */ /* are always their own groups */ // line comments
27+
// can usually get joined into
28+
// groups with adjacent lines
29+
30+
// empty lines separate groups
31+
// indentation does not impact grouping
32+
/* a single block
33+
* comment can span lines
34+
*/
35+
currentToken
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# kind offsets linecol text
2+
0 Comment 000:052 001:001 "// This, as expected, is a leading comment for Foo.\n"
3+
1 Ident 052:059 002:001 "message" leading:Token(0)
4+
2 Space 059:060 002:008 " "
5+
3 Ident 060:063 002:009 "Foo"
6+
4 Space 063:064 002:012 " "
7+
5 Punct 064:233 002:013 "{" close:Token(23) trailing:Token(7)
8+
6 Space 065:068 002:014 "\n "
9+
7 Comment 068:154 003:003 "// This is the TRAILING comment for Foo. (It is NOT\n" close:Token(9)
10+
8 Space 120:122 004:001 " "
11+
9 Comment 068:154 003:003 "// a detached comment for baz.)\n" open:Token(7)
12+
10 Space 154:157 005:001 "\n "
13+
11 Comment 157:184 006:003 "// leading comment for baz\n"
14+
12 Space 184:186 007:001 " "
15+
13 Ident 186:192 007:003 "string" leading:Token(11)
16+
14 Space 192:193 007:009 " "
17+
15 Ident 193:196 007:010 "baz"
18+
16 Space 196:197 007:013 " "
19+
17 Punct 197:198 007:014 "="
20+
18 Space 198:199 007:015 " "
21+
19 Number 199:200 007:016 "1" int:1
22+
20 Punct 200:201 007:017 ";" trailing:Token(22)
23+
21 Space 201:204 007:018 "\n "
24+
22 Comment 204:232 008:003 "// trailing comment for baz\n"
25+
23 Punct 064:233 002:013 "}" open:Token(5) trailing:Token(25)
26+
24 Space 233:234 009:002 "\n"
27+
25 Comment 234:342 010:001 "// This is NOT a trailing comment. It's also not considered\n" close:Token(26)
28+
26 Comment 234:342 010:001 "// a detached comment for Bar. It is discarded.\n" open:Token(25)
29+
27 Space 342:343 012:001 "\n"
30+
28 Comment 343:382 013:001 "// This IS a detached comment for Bar.\n"
31+
29 Space 382:383 014:001 "\n"
32+
30 Comment 383:413 015:001 "// A leading comment for Bar.\n"
33+
31 Ident 413:420 016:001 "message" detached:Token(28) leading:Token(30)
34+
32 Space 420:421 016:008 " "
35+
33 Ident 421:424 016:009 "Bar"
36+
34 Space 424:425 016:012 " "
37+
35 Punct 425:428 016:013 "{" close:Token(37)
38+
36 Space 426:427 016:014 "\n"
39+
37 Punct 425:428 016:013 "}" open:Token(35)
40+
38 Space 428:430 017:002 "\n\n"
41+
39 Ident 430:436 019:001 "string"
42+
40 Space 436:437 019:007 " "
43+
41 Ident 437:441 019:008 "name"
44+
42 Space 441:442 019:012 " "
45+
43 Punct 442:443 019:013 "="
46+
44 Space 443:444 019:014 " "
47+
45 Number 444:445 019:015 "1" int:1
48+
46 Punct 445:446 019:016 ";" trailing:Token(48)
49+
47 Space 446:447 019:017 " "
50+
48 Comment 447:476 019:018 "// trailing comment for name\n"
51+
49 Comment 476:502 020:001 "// leading comment for id\n"
52+
50 Ident 502:508 021:001 "uint64" leading:Token(49)
53+
51 Space 508:509 021:007 " "
54+
52 Ident 509:511 021:008 "id"
55+
53 Space 511:512 021:010 " "
56+
54 Punct 512:513 021:011 "="
57+
55 Space 513:514 021:012 " "
58+
56 Number 514:515 021:013 "2" int:2
59+
57 Punct 515:516 021:014 ";"
60+
58 Space 516:518 021:015 "\n\n"
61+
59 Ident 518:531 023:001 "previousToken" trailing:Token(61)
62+
60 Space 531:532 023:014 " "
63+
61 Comment 532:548 023:015 "// this comment\n"
64+
62 Comment 548:605 024:001 "// won't get merged into a\n" close:Token(63)
65+
63 Comment 548:605 024:001 "// group with these two lines\n" open:Token(62)
66+
64 Comment 605:625 026:001 "/* block comments */"
67+
65 Space 625:626 026:021 " "
68+
66 Comment 626:659 026:022 "/* are always their own groups */"
69+
67 Space 659:660 026:055 " "
70+
68 Comment 660:738 026:056 "// line comments\n" close:Token(70)
71+
69 Comment 677:708 027:001 "// can usually get joined into\n"
72+
70 Comment 660:738 026:056 "// groups with adjacent lines\n" open:Token(68)
73+
71 Space 738:742 029:001 "\n "
74+
72 Comment 742:813 030:004 "// empty lines separate groups\n" close:Token(73)
75+
73 Comment 742:813 030:004 "// indentation does not impact grouping\n" open:Token(72)
76+
74 Comment 813:860 032:001 "/* a single block\n * comment can span lines\n */"
77+
75 Space 860:861 034:004 "\n"
78+
76 Ident 861:873 035:001 "currentToken" detached:Token(62),Token(64),Token(66),Token(68),Token(72) leading:Token(74)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# kind offsets linecol text
22
0 Comment 000:039 001:001 "/*\n Nesting\n /* is not allowed */"
33
1 Space 039:040 003:025 "\n"
4-
2 Unrecognized 040:042 004:001 "*/"
4+
2 Unrecognized 040:042 004:001 "*/" leading:Token(0)

0 commit comments

Comments
 (0)