WIP: port incremental parse to CodeBlockItem

StevenWong12 · StevenWong12 · commit accde01cb9cf · 2023-06-26T22:15:54.000+08:00
diff --git a/Sources/SwiftParser/IncrementalParseTransition.swift b/Sources/SwiftParser/IncrementalParseTransition.swift
@@ -20,26 +20,41 @@
 /// This is also used for testing purposes to ensure incremental reparsing
 /// worked as expected.
 public protocol IncrementalParseReusedNodeDelegate {
-  /// Accepts the range and ``Syntax`` node of skipped source region.
+  /// Accepts the ``Syntax`` node of the skipped source region.
   ///
   /// - Parameters:
-  ///   - range: The source region of the currently parsed source.
   ///   - previousNode: The node from the previous tree that is associated with
   ///                   the skipped source region.
-  func parserReusedNode(range: ByteSourceRange, previousNode: Syntax)
+  func parserReusedNode(previousNode: Syntax)
 }
 
 /// An implementation of `IncrementalParseReusedNodeDelegate` that just collects
 /// the range and re-used node into an array.
 public final class IncrementalParseReusedNodeCollector:
   IncrementalParseReusedNodeDelegate
 {
-  public var rangeAndNodes: [(ByteSourceRange, Syntax)] = []
+  public var nodes: [Syntax] = []
 
   public init() {}
 
-  public func parserReusedNode(range: ByteSourceRange, previousNode: Syntax) {
-    rangeAndNodes.append((range, previousNode))
+  public func parserReusedNode(previousNode: Syntax) {
+    nodes.append(previousNode)
+  }
+}
+
+/// Records the affected range of potentially re-used nodes. When an edit intersects a node's affected range, the node cannot be re-used.
+///
+/// This also acts as the trigger that enables the parser to parse incrementally.
+public final class IncrementalParseNodeAffectRangeCollector {
+  /// Maps each node to the UTF-8 length of the source region that might affect how the node is parsed.
+  /// This information is used to determine whether a node can be reused.
+  fileprivate var nodeAffectRange: [RawSyntax.ID: Int] = [:]
+
+  public init() {}
+
+  @_spi(RawSyntax)
+  public func registerNodeForIncrementalParse(node: RawSyntax, length: Int) {
+    self.nodeAffectRange[node.id] = length
   }
 }
 
@@ -71,13 +86,17 @@ public final class IncrementalParseTransition {
 /// updated source that was already parsed during a previous parse invocation.
 public struct IncrementalParseLookup {
   fileprivate let transition: IncrementalParseTransition
+
   fileprivate var cursor: SyntaxCursor
 
+  fileprivate let nodeAffectRangeCollector: IncrementalParseNodeAffectRangeCollector
+
   /// Create a new ``IncrementalParseLookup`` that can look nodes up based on the
   /// given ``IncrementalParseTransition``.
-  public init(transition: IncrementalParseTransition) {
+  public init(transition: IncrementalParseTransition, nodeAffectRangeCollector: IncrementalParseNodeAffectRangeCollector) {
     self.transition = transition
     self.cursor = .init(root: Syntax(transition.previousTree))
+    self.nodeAffectRangeCollector = nodeAffectRangeCollector
   }
 
   fileprivate var edits: ConcurrentEdits {
@@ -109,7 +128,6 @@ public struct IncrementalParseLookup {
     let node = cursorLookup(prevPosition: prevPosition, kind: kind)
     if let delegate = reusedDelegate, let node {
       delegate.parserReusedNode(
-        range: ByteSourceRange(offset: newOffset, length: node.byteSize),
         previousNode: node
       )
     }
@@ -148,24 +166,13 @@ public struct IncrementalParseLookup {
       return true
     }
 
-    // Node can also not be reused if an edit has been made in the next token's
-    // text, e.g. because `private struct Foo {}` parses as a CodeBlockItem with
-    // a StructDecl inside and `private struc Foo {}` parses as two
-    // CodeBlockItems one for `private` and one for `struc Foo {}`
-    var nextLeafNodeLength: SourceLength = .zero
-    if let nextSibling = cursor.nextSibling {
-      // Fast path check: if next sibling is before all the edits then we can
-      // re-use the node.
-      if !edits.edits.isEmpty && edits.edits.first!.range.offset > nextSibling.endPosition.utf8Offset {
-        return true
-      }
-      if let nextToken = nextSibling.firstToken(viewMode: .sourceAccurate) {
-        nextLeafNodeLength = nextToken.leadingTriviaLength + nextToken.contentLength
-      }
+    guard let nodeAffectRangeLength = nodeAffectRangeCollector.nodeAffectRange[node.raw.id] else {
+      return false
     }
+
     let nodeAffectRange = ByteSourceRange(
       offset: node.position.utf8Offset,
-      length: (node.totalLength + nextLeafNodeLength).utf8Length
+      length: nodeAffectRangeLength
     )
 
     for edit in edits.edits {
diff --git a/Sources/SwiftParser/Lexer/LexemeSequence.swift b/Sources/SwiftParser/Lexer/LexemeSequence.swift
@@ -32,24 +32,53 @@ extension Lexer {
     /// usually less than 0.1% of the memory allocated by the syntax arena.
     var lexerStateAllocator = BumpPtrAllocator(slabSize: 256)
 
+    /// The offset of the end of the next token, relative to `sourceBufferStart`.
+    var offsetToNextTokenEnd: Int {
+      self.getOffsetToStart(self.nextToken) + self.nextToken.byteLength
+    }
+
+    /// See doc comments in ``LookaheadTracker``
+    var lookaheadTracker: UnsafeMutablePointer<LookaheadTracker>
+
     fileprivate init(sourceBufferStart: Lexer.Cursor, cursor: Lexer.Cursor) {
       self.sourceBufferStart = sourceBufferStart
       self.cursor = cursor
       self.nextToken = self.cursor.nextToken(sourceBufferStart: self.sourceBufferStart, stateAllocator: lexerStateAllocator)
+      self.lookaheadTracker = .allocate(capacity: 1)
+      self.lookaheadTracker.initialize(to: LookaheadTracker())
     }
 
     @_spi(Testing)
     public mutating func next() -> Lexer.Lexeme? {
       return self.advance()
     }
 
+    func recordFurthestOffset() {
+      self.lookaheadTracker.pointee.recordFurthestOffset(self.offsetToNextTokenEnd)
+    }
+
     mutating func advance() -> Lexer.Lexeme {
       defer {
         self.nextToken = self.cursor.nextToken(sourceBufferStart: self.sourceBufferStart, stateAllocator: lexerStateAllocator)
       }
       return self.nextToken
     }
 
+    /// Get the offset of `token` relative to `sourceBufferStart`.
+    func getOffsetToStart(_ token: Lexer.Lexeme) -> Int {
+      return self.sourceBufferStart.distance(to: token.cursor)
+    }
+
+    /// Advance the cursor by `offset` from the start of `currentToken`, then reset `currentToken` to the token lexed at the new position.
+    mutating func advance(by offset: Int, currentToken: inout Lexer.Lexeme) {
+      self.cursor = currentToken.cursor
+      self.cursor.position = self.cursor.position.advanced(by: offset)
+
+      self.nextToken = self.cursor.nextToken(sourceBufferStart: self.sourceBufferStart, stateAllocator: lexerStateAllocator)
+
+      currentToken = self.advance()
+    }
+
     /// Reset the lexeme sequence to the state we were in when lexing `splitToken`
     /// but after we consumed `consumedPrefix` bytes from `splitToken`.
     /// - Warning: Do not add more usages of this function.
diff --git a/Sources/SwiftParser/Lookahead.swift b/Sources/SwiftParser/Lookahead.swift
@@ -33,6 +33,7 @@ extension Parser {
     ) {
       self.lexemes = lexemes
       self.currentToken = currentToken
+      self.lexemes.recordFurthestOffset()
     }
 
     fileprivate init(cloning other: Parser) {
@@ -90,6 +91,7 @@ extension Parser.Lookahead {
   mutating func consumeAnyToken() {
     tokensConsumed += 1
     self.currentToken = self.lexemes.advance()
+    self.lexemes.recordFurthestOffset()
   }
 
   mutating func consumeAnyToken(remapping: RawTokenKind) {
diff --git a/Sources/SwiftParser/Parser.swift b/Sources/SwiftParser/Parser.swift
@@ -101,6 +101,10 @@ public struct Parser {
   /// When this nesting level is exceeded, the parser should stop parsing.
   let maximumNestingLevel: Int
 
+  let parseLookup: IncrementalParseLookup?
+
+  let parseNodeAffectRange: IncrementalParseNodeAffectRangeCollector?
+
   /// A default maximum nesting level that is used if the client didn't
   /// explicitly specify one. Debug builds of the parser comume a lot more stack
   /// space and thus have a lower default maximum nesting level.
@@ -111,7 +115,12 @@ public struct Parser {
   #endif
 
   /// Initializes a ``Parser`` from the given string.
-  public init(_ input: String, maximumNestingLevel: Int? = nil) {
+  public init(
+    _ input: String,
+    maximumNestingLevel: Int? = nil,
+    parseNodeAffectRange: IncrementalParseNodeAffectRangeCollector? = nil,
+    parseTransition: IncrementalParseTransition? = nil
+  ) {
     self.maximumNestingLevel = maximumNestingLevel ?? Self.defaultMaximumNestingLevel
 
     self.arena = ParsingSyntaxArena(
@@ -126,6 +135,14 @@ public struct Parser {
 
     self.lexemes = Lexer.tokenize(interned)
     self.currentToken = self.lexemes.advance()
+    self.parseNodeAffectRange = parseNodeAffectRange
+    if let parseTransition,
+      let parseNodeAffectRange
+    {
+      self.parseLookup = IncrementalParseLookup(transition: parseTransition, nodeAffectRangeCollector: parseNodeAffectRange)
+    } else {
+      self.parseLookup = nil
+    }
   }
 
   /// Initializes a ``Parser`` from the given input buffer.
@@ -142,7 +159,13 @@ public struct Parser {
   ///            arena is created automatically, and `input` copied into the
   ///            arena. If non-`nil`, `input` must be within its registered
   ///            source buffer or allocator.
-  public init(_ input: UnsafeBufferPointer<UInt8>, maximumNestingLevel: Int? = nil, arena: ParsingSyntaxArena? = nil) {
+  public init(
+    _ input: UnsafeBufferPointer<UInt8>,
+    maximumNestingLevel: Int? = nil,
+    parseNodeAffectRange: IncrementalParseNodeAffectRangeCollector? = nil,
+    parseTransition: IncrementalParseTransition? = nil,
+    arena: ParsingSyntaxArena? = nil
+  ) {
     self.maximumNestingLevel = maximumNestingLevel ?? Self.defaultMaximumNestingLevel
 
     var sourceBuffer: UnsafeBufferPointer<UInt8>
@@ -159,6 +182,14 @@ public struct Parser {
 
     self.lexemes = Lexer.tokenize(sourceBuffer)
     self.currentToken = self.lexemes.advance()
+    self.parseNodeAffectRange = parseNodeAffectRange
+    if let parseTransition,
+      let parseNodeAffectRange
+    {
+      self.parseLookup = IncrementalParseLookup(transition: parseTransition, nodeAffectRangeCollector: parseNodeAffectRange)
+    } else {
+      self.parseLookup = nil
+    }
   }
 
   mutating func missingToken(_ kind: RawTokenKind, text: SyntaxText? = nil) -> RawTokenSyntax {
@@ -237,6 +268,7 @@ public struct Parser {
 extension Parser {
   /// Retrieves the token following the current token without consuming it.
   func peek() -> Lexer.Lexeme {
+    lexemes.recordFurthestOffset()
     return self.lexemes.peek()
   }
 }
@@ -629,3 +661,44 @@ extension Parser {
     )
   }
 }
+
+// MARK: Incremental Parsing
+extension Parser {
+  mutating func loadCurrentSyntaxNodeFromCache(for kind: SyntaxKind) -> Syntax? {
+    guard var parseLookup else {
+      return nil
+    }
+
+    let currentOffset = self.lexemes.getOffsetToStart(self.currentToken)
+    if let node = parseLookup.lookUp(currentOffset, kind: kind) {
+      self.lexemes.advance(by: node.byteSize, currentToken: &self.currentToken)
+      return node
+    }
+
+    return nil
+  }
+
+  func registerNodeForIncrementalParse(node: RawSyntax, startToken: Lexer.Lexeme) {
+    guard let parseNodeAffectRange else {
+      return
+    }
+    parseNodeAffectRange.registerNodeForIncrementalParse(
+      node: node,
+      length: max(lookaheadFurthestOffset - self.lexemes.getOffsetToStart(startToken), node.byteLength + currentToken.byteLength)
+    )
+  }
+
+  public var lookaheadFurthestOffset: Int {
+    return lexemes.lookaheadTracker.pointee.furthestOffset
+  }
+}
+
+/// Records the furthest offset from `sourceBufferStart` that is reached by ``Parser/peek()`` or ``Lookahead`` in ``Lexer/LexemeSequence``.
+struct LookaheadTracker {
+  private(set) var furthestOffset: Int = 0
+
+  public mutating func recordFurthestOffset(_ furthestOffset: Int) {
+    // We may look ahead multiple times to find different valid parts of a node, so take the maximum lookahead offset as the node's possible affected range.
+    self.furthestOffset = max(furthestOffset, self.furthestOffset)
+  }
+}
diff --git a/Sources/SwiftParser/TopLevel.swift b/Sources/SwiftParser/TopLevel.swift
@@ -151,6 +151,12 @@ extension Parser {
   ///     statement → compiler-control-statement
   ///     statements → statement statements?
   mutating func parseCodeBlockItem(isAtTopLevel: Bool, allowInitDecl: Bool) -> RawCodeBlockItemSyntax? {
+    let startToken = self.currentToken
+    if let syntax = self.loadCurrentSyntaxNodeFromCache(for: .codeBlockItem) {
+      self.registerNodeForIncrementalParse(node: syntax.raw, startToken: startToken)
+      return RawCodeBlockItemSyntax(syntax.raw)
+    }
+
     if let remainingTokens = remainingTokensIfMaximumNestingLevelReached() {
       return RawCodeBlockItemSyntax(
         remainingTokens,
@@ -183,12 +189,17 @@ extension Parser {
     if item.raw.isEmpty && semi == nil && trailingSemis.isEmpty {
       return nil
     }
-    return RawCodeBlockItemSyntax(
+
+    let result = RawCodeBlockItemSyntax(
       item: item,
       semicolon: semi,
       RawUnexpectedNodesSyntax(trailingSemis, arena: self.arena),
       arena: self.arena
     )
+
+    self.registerNodeForIncrementalParse(node: result.raw, startToken: startToken)
+
+    return result
   }
 
   private mutating func parseStatementItem() -> RawCodeBlockItemSyntax.Item {
diff --git a/Sources/SwiftParser/generated/Parser+Entry.swift b/Sources/SwiftParser/generated/Parser+Entry.swift
@@ -19,9 +19,10 @@ extension Parser {
   /// `Parser.init` for more details.
   public static func parse(
     source: String,
+    parseNodeAffectRange: IncrementalParseNodeAffectRangeCollector? = nil,
     parseTransition: IncrementalParseTransition? = nil
   ) -> SourceFileSyntax {
-    var parser = Parser(source)
+    var parser = Parser(source, parseNodeAffectRange: parseNodeAffectRange, parseTransition: parseTransition)
     return SourceFileSyntax.parse(from: &parser)
   }
   
@@ -30,9 +31,10 @@ extension Parser {
   public static func parse(
     source: UnsafeBufferPointer<UInt8>,
     maximumNestingLevel: Int? = nil,
+    parseNodeAffectRange: IncrementalParseNodeAffectRangeCollector? = nil,
     parseTransition: IncrementalParseTransition? = nil
   ) -> SourceFileSyntax {
-    var parser = Parser(source, maximumNestingLevel: maximumNestingLevel)
+    var parser = Parser(source, maximumNestingLevel: maximumNestingLevel, parseNodeAffectRange: parseNodeAffectRange, parseTransition: parseTransition)
     return SourceFileSyntax.parse(from: &parser)
   }
 }
diff --git a/Sources/SwiftSyntax/Raw/RawSyntax.swift b/Sources/SwiftSyntax/Raw/RawSyntax.swift
@@ -918,6 +918,20 @@ extension RawSyntax {
   }
 }
 
+extension RawSyntax: Identifiable {
+  public struct ID: Hashable {
+    /// The pointer to the start of the `RawSyntax` node.
+    private var pointer: UnsafeRawPointer
+    fileprivate init(_ raw: RawSyntax) {
+      self.pointer = UnsafeRawPointer(raw.pointer)
+    }
+  }
+
+  public var id: ID {
+    return ID(self)
+  }
+}
+
 #if DEBUG
 /// See `SyntaxMemoryLayout`.
 var RawSyntaxDataMemoryLayouts: [String: SyntaxMemoryLayout.Value] = [
diff --git a/Sources/_SwiftSyntaxTestSupport/IncrementalParseTestUtils.swift b/Sources/_SwiftSyntaxTestSupport/IncrementalParseTestUtils.swift
diff --git a/Sources/swift-parser-cli/Commands/PerformanceTest.swift b/Sources/swift-parser-cli/Commands/PerformanceTest.swift
diff --git a/Tests/SwiftParserTest/IncrementalParsingTests.swift b/Tests/SwiftParserTest/IncrementalParsingTests.swift

Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,7 @@ extension Parser {`
`33`	`33`	`) {`
`34`	`34`	`self.lexemes = lexemes`
`35`	`35`	`self.currentToken = currentToken`
	`36`	`+ self.lexemes.recordFurthestOffset()`
`36`	`37`	`}`
`37`	`38`
`38`	`39`	`fileprivate init(cloning other: Parser) {`
`@@ -90,6 +91,7 @@ extension Parser.Lookahead {`
`90`	`91`	`mutating func consumeAnyToken() {`
`91`	`92`	`tokensConsumed += 1`
`92`	`93`	`self.currentToken = self.lexemes.advance()`
	`94`	`+ self.lexemes.recordFurthestOffset()`
`93`	`95`	`}`
`94`	`96`
`95`	`97`	`mutating func consumeAnyToken(remapping: RawTokenKind) {`