lexer changes

mcy · mcy · commit 6673c095bbbd · 2024-12-09T13:53:58.000-08:00
diff --git a/experimental/parser/lex.go b/experimental/parser/lex.go
@@ -96,7 +96,9 @@ func Lex(ctx token.Context, errs *report.Report) {
 			} else {
 				text = l.SeekEOF()
 			}
+
 			l.Push(len("//")+len(text), token.Comment)
+
 		case r == '/' && l.Peek() == '*':
 			l.cursor++ // Skip the *.
 
@@ -116,7 +118,9 @@ func Lex(ctx token.Context, errs *report.Report) {
 				l.Error(ErrUnmatched{Span: l.SpanFrom(l.cursor - 2)})
 				text = l.SeekEOF()
 			}
+
 			l.Push(len("/*")+len(text), token.Comment)
+
 		case r == '*' && l.Peek() == '/':
 			l.cursor++ // Skip the /.
 
diff --git a/experimental/parser/lex_state.go b/experimental/parser/lex_state.go
@@ -15,6 +15,8 @@
 package parser
 
 import (
+	"fmt"
+	"slices"
 	"strings"
 	"unicode/utf8"
 
@@ -31,11 +33,12 @@ type lexer struct {
 	cursor, count int
 
 	braces []token.ID
-}
 
-func (l *lexer) Push(length int, kind token.Kind) token.Token {
-	l.count++
-	return l.Stream.Push(length, kind)
+	prev token.ID // The last non-skippable token.
+
+	firstCommentSincePrev  token.ID
+	firstCommentOnSameLine bool
+	parStart, parEnd       token.ID
 }
 
 func (l *lexer) Cursor() int {
@@ -107,6 +110,194 @@ func (l *lexer) SpanFrom(start int) report.Span {
 	return l.Span(start, l.cursor)
 }
 
+func (l *lexer) Push(length int, kind token.Kind) token.Token {
+	l.count++
+	prev := l.prev.In(l.Context) // The last non-skippable token pushed before this one; may be nil.
+	tok := l.Stream.Push(length, kind)
+	// NOTE: tok will have the Stream rather than l.Context as its context,
+	// which will cause issues when we call NewCursor below.
+	tok = tok.ID().In(l.Context)
+
+	// NOTE: For the purposes of attributing comments, we need to know what line
+	// certain offsets are at. Although we could track this as we advance cursor,
+	// we instead use other methods to determine if two tokens are on the same
+	// line. This is for a couple of reasons.
+	//
+	// 1. Getting a line number from the line index is O(log n), but we can
+	//    instead use strings.Index and friends in some places without going
+	//    quadratic.
+	//
+	// 2. Having to examine every character directly locks us out of using e.g.
+	//    strings.Index for certain operations, which is much more efficient
+	//    than the naive for loop.
+
+	switch { // Dispatch on the kind of the token that was just pushed.
+	case tok.Kind() == token.Comment:
+		isLineComment := strings.HasPrefix(tok.Text(), "//")
+
+		if l.firstCommentSincePrev.Nil() {
+			l.firstCommentSincePrev = tok.ID()
+
+			if !prev.Nil() && l.newLinesBetween(prev, tok, 1) == 0 {
+				// The first comment is always in a paragraph by itself if there
+				// is no newline between the previous token and the comment.
+				l.firstCommentOnSameLine = true
+				break
+			}
+		}
+
+		if !isLineComment {
+			// Block comments cannot be made into paragraphs, so we must
+			// interrupt the current paragraph.
+			l.fuseParagraph()
+			break
+		}
+
+		// Start building up a line comment paragraph if there isn't one
+		// currently.
+		if l.parStart.Nil() {
+			l.parStart = tok.ID()
+		}
+		l.parEnd = tok.ID()
+
+	case tok.Kind() == token.Space:
+		// Note that line comments contain their newlines, except for a line
+		// comment at the end of the file. Thus, seeing even a single newline
+		// in a space token means that we are interrupting a line comment
+		// paragraph, and thus we must fuse the current paragraph.
+		if strings.Contains(tok.Text(), "\n") {
+			l.fuseParagraph()
+		}
+
+	default:
+		l.fuseParagraph()
+		//nolint:dupword // False positive due to comments describing an algorithm.
+		if !l.firstCommentSincePrev.Nil() {
+			fmt.Println(l.firstCommentSincePrev.In(l.Context), tok) // NOTE(review): looks like leftover debug output on stdout; confirm and remove.
+			comments := token.NewCursor(l.firstCommentSincePrev.In(l.Context), tok)
+			var first, second, penultimate, last token.Token
+			for { // Don't use l.Done() here, that tosses comment tokens.
+				next := comments.PopSkippable()
+				if next.Nil() {
+					break
+				} else if next.Kind() == token.Comment {
+					switch {
+					case first.Nil():
+						first = next
+					case second.Nil():
+						second = next
+					}
+					penultimate = last
+					last = next
+				}
+			}
+			fmt.Println(first, second, penultimate, last) // NOTE(review): looks like leftover debug output on stdout; confirm and remove.
+
+			// Determine if we need to donate first to the previous token.
+			var donate bool
+			switch {
+			case prev.Nil():
+				donate = false
+			case l.firstCommentOnSameLine:
+				donate = true
+			case l.newLinesBetween(prev, first, 2) < 2:
+				// Now we need to check the remaining three criteria for
+				// donate. These are:
+				//
+				// 1. Is there more than one comment.
+				// 2. Is the token one of the closers ), ], or } (but not
+				//    >).
+				// 3. The line of the current token minus the end line of
+				//    the first comment is greater than one.
+				switch {
+				case !second.Nil():
+					donate = true
+				case slices.Contains([]string{")", "]", "}"}, tok.Text()):
+					donate = true
+				case l.newLinesBetween(first, tok, 2) > 1:
+					donate = true
+				}
+			}
+
+			if donate {
+				prev.Comments().SetTrailing(first)
+				first = second
+			}
+
+			// The leading comment must have precisely one newline between
+			// it and the new token.
+			if !first.Nil() && !last.Nil() && l.newLinesBetween(last, tok, 2) == 1 {
+				tok.Comments().SetLeading(last)
+				last = penultimate
+			}
+
+			// Check if we have any detached comments left. This is the case
+			// when first and last are both non-nil and first <= last. If we donated
+			// the only comment, second will have been nil, so first is now nil.
+			//
+			// If we attached the only remaining comment after donating a
+			// comment, we would have had the following value evolution for
+			// first, second, penultimate and last:
+			//
+			//   before donate: a, b, a, b
+			//   after donate: b, b, a, b
+			//   after attach: b, b, a, a
+			//
+			// Thus, when we check b <= a, it fails, and we find that we have
+			// nothing left to attach.
+			if !first.Nil() && !last.Nil() && first.ID() <= last.ID() {
+				tok.Comments().SetDetachedRange(first, last)
+			}
+
+			l.firstCommentSincePrev = 0 // Reset the comment-tracking state for the next run of comments.
+			l.firstCommentOnSameLine = false
+		}
+
+		l.prev = tok.ID() // Only tokens that are neither comments nor spaces reach this branch.
+	}
+	return tok
+}
+
+// fuseParagraph closes the line comment paragraph being built: if it spans two
+// distinct comments they are fused, and parStart is cleared either way.
+func (l *lexer) fuseParagraph() {
+	start, end := l.parStart, l.parEnd
+	if multi := !start.Nil() && start != end; multi {
+		token.Fuse(start.In(l.Context), end.In(l.Context))
+	}
+	l.parStart = 0
+}
+
+// newLinesBetween counts the number of \n characters between the end of a
+// and the start of b, up to max.
+//
+// The final rune of a is included in this count, since comments may end in a
+// \n rune.
+//
+//nolint:revive,predeclared // Complains about redefining max.
+func (l *lexer) newLinesBetween(a, b token.Token, max int) int {
+	end := a.Span().End
+	if end != 0 {
+		// Account for the final rune of a.
+		end--
+	}
+
+	start := b.Span().Start
+	between := l.Text()[end:start]
+
+	var total int
+	for total < max {
+		var found bool
+		_, between, found = strings.Cut(between, "\n")
+		if !found {
+			break
+		}
+
+		total++
+	}
+	return total
+}
+
 // mustProgress returns a progress checker for this lexer.
 func (l *lexer) mustProgress() mustProgress {
 	return mustProgress{l, -1}
diff --git a/experimental/parser/lex_test.go b/experimental/parser/lex_test.go
@@ -26,6 +26,7 @@ import (
 	"github.com/bufbuild/protocompile/experimental/report"
 	"github.com/bufbuild/protocompile/experimental/token"
 	"github.com/bufbuild/protocompile/internal/golden"
+	"github.com/bufbuild/protocompile/internal/iters"
 )
 
 func TestRender(t *testing.T) {
@@ -87,6 +88,22 @@ func TestRender(t *testing.T) {
 				}
 			}
 
+			comments := tok.Comments()
+			iters.Enumerate(comments.Detached())(func(i int, t token.Token) bool {
+				if i == 0 {
+					fmt.Fprintf(&tsv, "\t\tdetached:%v", t.ID())
+				} else {
+					fmt.Fprintf(&tsv, ",%v", t.ID())
+				}
+				return true
+			})
+			if leading := comments.Leading(); !leading.Nil() {
+				fmt.Fprintf(&tsv, "\t\tleading:%v", leading.ID())
+			}
+			if trailing := comments.Trailing(); !trailing.Nil() {
+				fmt.Fprintf(&tsv, "\t\ttrailing:%v", trailing.ID())
+			}
+
 			tsv.WriteByte('\n')
 			return true
 		})
diff --git a/experimental/parser/testdata/lexer/comments/attribution.proto b/experimental/parser/testdata/lexer/comments/attribution.proto
@@ -0,0 +1,35 @@
+// This, as expected, is a leading comment for Foo.
+message Foo {
+  // This is the TRAILING comment for Foo. (It is NOT
+  // a detached comment for baz.)
+
+  // leading comment for baz
+  string baz = 1;
+  // trailing comment for baz
+}
+// This is NOT a trailing comment. It's also not considered
+// a detached comment for Bar. It is discarded.
+
+// This IS a detached comment for Bar.
+
+// A leading comment for Bar.
+message Bar {
+}
+
+string name = 1; // trailing comment for name
+// leading comment for id
+uint64 id = 2;
+
+previousToken // this comment
+// won't get merged into a
+// group with these two lines
+/* block comments */ /* are always their own groups */ // line comments
+// can usually get joined into
+// groups with adjacent lines
+
+   // empty lines separate groups
+// indentation does not impact grouping
+/* a single block
+ * comment can span lines
+ */
+currentToken
diff --git a/experimental/parser/testdata/lexer/comments/attribution.proto.tokens.tsv b/experimental/parser/testdata/lexer/comments/attribution.proto.tokens.tsv
@@ -0,0 +1,78 @@
+#		kind		offsets		linecol		text
+0		Comment		000:052		001:001		"// This, as expected, is a leading comment for Foo.\n"
+1		Ident		052:059		002:001		"message"		leading:Token(0)
+2		Space		059:060		002:008		" "
+3		Ident		060:063		002:009		"Foo"
+4		Space		063:064		002:012		" "
+5		Punct		064:233		002:013		"{"		close:Token(23)		trailing:Token(7)
+6		Space		065:068		002:014		"\n  "
+7		Comment		068:154		003:003		"// This is the TRAILING comment for Foo. (It is NOT\n"		close:Token(9)
+8		Space		120:122		004:001		"  "
+9		Comment		068:154		003:003		"// a detached comment for baz.)\n"		open:Token(7)
+10		Space		154:157		005:001		"\n  "
+11		Comment		157:184		006:003		"// leading comment for baz\n"
+12		Space		184:186		007:001		"  "
+13		Ident		186:192		007:003		"string"		leading:Token(11)
+14		Space		192:193		007:009		" "
+15		Ident		193:196		007:010		"baz"
+16		Space		196:197		007:013		" "
+17		Punct		197:198		007:014		"="
+18		Space		198:199		007:015		" "
+19		Number		199:200		007:016		"1"		int:1
+20		Punct		200:201		007:017		";"		trailing:Token(22)
+21		Space		201:204		007:018		"\n  "
+22		Comment		204:232		008:003		"// trailing comment for baz\n"
+23		Punct		064:233		002:013		"}"		open:Token(5)		trailing:Token(25)
+24		Space		233:234		009:002		"\n"
+25		Comment		234:342		010:001		"// This is NOT a trailing comment. It's also not considered\n"		close:Token(26)
+26		Comment		234:342		010:001		"// a detached comment for Bar. It is discarded.\n"		open:Token(25)
+27		Space		342:343		012:001		"\n"
+28		Comment		343:382		013:001		"// This IS a detached comment for Bar.\n"
+29		Space		382:383		014:001		"\n"
+30		Comment		383:413		015:001		"// A leading comment for Bar.\n"
+31		Ident		413:420		016:001		"message"		detached:Token(28)		leading:Token(30)
+32		Space		420:421		016:008		" "
+33		Ident		421:424		016:009		"Bar"
+34		Space		424:425		016:012		" "
+35		Punct		425:428		016:013		"{"		close:Token(37)
+36		Space		426:427		016:014		"\n"
+37		Punct		425:428		016:013		"}"		open:Token(35)
+38		Space		428:430		017:002		"\n\n"
+39		Ident		430:436		019:001		"string"
+40		Space		436:437		019:007		" "
+41		Ident		437:441		019:008		"name"
+42		Space		441:442		019:012		" "
+43		Punct		442:443		019:013		"="
+44		Space		443:444		019:014		" "
+45		Number		444:445		019:015		"1"		int:1
+46		Punct		445:446		019:016		";"		trailing:Token(48)
+47		Space		446:447		019:017		" "
+48		Comment		447:476		019:018		"// trailing comment for name\n"
+49		Comment		476:502		020:001		"// leading comment for id\n"
+50		Ident		502:508		021:001		"uint64"		leading:Token(49)
+51		Space		508:509		021:007		" "
+52		Ident		509:511		021:008		"id"
+53		Space		511:512		021:010		" "
+54		Punct		512:513		021:011		"="
+55		Space		513:514		021:012		" "
+56		Number		514:515		021:013		"2"		int:2
+57		Punct		515:516		021:014		";"
+58		Space		516:518		021:015		"\n\n"
+59		Ident		518:531		023:001		"previousToken"		trailing:Token(61)
+60		Space		531:532		023:014		" "
+61		Comment		532:548		023:015		"// this comment\n"
+62		Comment		548:605		024:001		"// won't get merged into a\n"		close:Token(63)
+63		Comment		548:605		024:001		"// group with these two lines\n"		open:Token(62)
+64		Comment		605:625		026:001		"/* block comments */"
+65		Space		625:626		026:021		" "
+66		Comment		626:659		026:022		"/* are always their own groups */"
+67		Space		659:660		026:055		" "
+68		Comment		660:738		026:056		"// line comments\n"		close:Token(70)
+69		Comment		677:708		027:001		"// can usually get joined into\n"
+70		Comment		660:738		026:056		"// groups with adjacent lines\n"		open:Token(68)
+71		Space		738:742		029:001		"\n   "
+72		Comment		742:813		030:004		"// empty lines separate groups\n"		close:Token(73)
+73		Comment		742:813		030:004		"// indentation does not impact grouping\n"		open:Token(72)
+74		Comment		813:860		032:001		"/* a single block\n * comment can span lines\n */"
+75		Space		860:861		034:004		"\n"
+76		Ident		861:873		035:001		"currentToken"		detached:Token(62),Token(64),Token(66),Token(68),Token(72)		leading:Token(74)
diff --git a/experimental/parser/testdata/lexer/comments/nested.proto.tokens.tsv b/experimental/parser/testdata/lexer/comments/nested.proto.tokens.tsv
@@ -1,4 +1,4 @@
 #		kind		offsets		linecol		text
 0		Comment		000:039		001:001		"/*\n    Nesting\n    /* is not allowed */"
 1		Space		039:040		003:025		"\n"
-2		Unrecognized		040:042		004:001		"*/"
+2		Unrecognized		040:042		004:001		"*/"		leading:Token(0)
diff --git a/experimental/parser/testdata/lexer/comments/ok.proto.tokens.tsv b/experimental/parser/testdata/lexer/comments/ok.proto.tokens.tsv
diff --git a/experimental/parser/testdata/lexer/smoke.proto.tokens.tsv b/experimental/parser/testdata/lexer/smoke.proto.tokens.tsv