Skip to content

CDATA support for XML #15603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions compiler/src/dotty/tools/dotc/config/ScalaSettings.scala
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,10 @@ private sealed trait XSettings:
}

val XmacroSettings: Setting[List[String]] = MultiStringSetting("-Xmacro-settings", "setting1,setting2,..settingN", "List of settings which exposed to the macros")

// XML parsing options
// "coalescing": convert PCData to Text and coalesce sibling nodes
val Xxml = ChoiceSetting("-Xxml", "property", "Configure XML parsing.", List("coalescing"), "coalescing")
end XSettings

/** -Y "Forking" as in forked tongue or "Private" settings */
Expand Down
4 changes: 2 additions & 2 deletions compiler/src/dotty/tools/dotc/parsing/Parsers.scala
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ object Parsers {
*/
lazy val xmlp: xml.MarkupParsers.MarkupParser = {
myFirstXmlPos = source.atSpan(Span(in.offset))
new MarkupParser(this, true)
new MarkupParser(this, preserveWS = true, isCoalescing = ctx.settings.Xxml.value == "coalescing")
}

/** The position of the first XML literal encountered while parsing,
Expand All @@ -532,7 +532,7 @@ object Parsers {
def firstXmlPos: SourcePosition = myFirstXmlPos
private var myFirstXmlPos: SourcePosition = NoSourcePosition

object symbXMLBuilder extends xml.SymbolicXMLBuilder(this, true) // DEBUG choices
object symbXMLBuilder extends xml.SymbolicXMLBuilder(this, preserveWS = true, isCoalescing = ctx.settings.Xxml.value == "coalescing")

def xmlLiteral() : Tree = xmlp.xLiteral
def xmlLiteralPattern() : Tree = xmlp.xLiteralPattern
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ private[dotty] trait MarkupParserCommon {
val rest = until.tail

while (true) {
if (ch == head && peek(rest))
if ch == head && peek(rest) then
return handler(positioner(), sb.toString)
else if (ch == SU)
truncatedError("") // throws TruncatedXMLControl in compiler
Expand Down
191 changes: 105 additions & 86 deletions compiler/src/dotty/tools/dotc/parsing/xml/MarkupParsers.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package xml

import scala.language.unsafeNulls

import scala.annotation.tailrec
import scala.collection.mutable
import mutable.{ Buffer, ArrayBuffer, ListBuffer }
import scala.util.control.ControlThrowable
Expand All @@ -16,10 +17,12 @@ import Constants._
import util.SourceFile
import Utility._

import SymbolicXMLBuilder.*


// XXX/Note: many/most of the functions in here are almost direct cut and pastes
// from another file - scala.xml.parsing.MarkupParser, it looks like.
// (It was like that when I got here.) They used to be commented "[Duplicate]" butx
// (It was like that when I got here.) They used to be commented "[Duplicate]" but
// since approximately all of them were, I snipped it as noise. As far as I can
// tell this wasn't for any particularly good reason, but slightly different
// compiler and library parser interfaces meant it would take some setup.
Expand Down Expand Up @@ -49,7 +52,7 @@ object MarkupParsers {
override def getMessage: String = "input ended while parsing XML"
}

class MarkupParser(parser: Parser, final val preserveWS: Boolean)(implicit src: SourceFile) extends MarkupParserCommon {
class MarkupParser(parser: Parser, final val preserveWS: Boolean, isCoalescing: Boolean)(implicit src: SourceFile) extends MarkupParserCommon {

import Tokens.{ LBRACE, RBRACE }

Expand All @@ -68,17 +71,18 @@ object MarkupParsers {
if (ch == SU) throw TruncatedXMLControl
else reportSyntaxError(msg)

var input : CharArrayReader = _
var input: CharArrayReader = _
def lookahead(): BufferedIterator[Char] =
(input.buf drop input.charOffset).iterator.buffered

import parser.{ symbXMLBuilder => handle }

def curOffset : Int = input.charOffset - 1
var tmppos : Span = NoSpan
def curOffset: Int = input.lastCharOffset

var tmppos: Span = NoSpan
def ch: Char = input.ch
/** Assigns the next character to `ch` and advances the input. */
def nextch(): Unit = { input.nextChar() }
def nextch(): Unit = input.nextChar()

protected def ch_returning_nextch: Char = {
val result = ch; input.nextChar(); result
Expand Down Expand Up @@ -181,22 +185,17 @@ object MarkupParsers {
xTakeUntil(handle.comment, () => Span(start, curOffset, start), "-->")
}

def appendText(span: Span, ts: Buffer[Tree], txt: String): Unit = {
def append(t: String) = ts append handle.text(span, t)

if (preserveWS) append(txt)
else {
val sb = new StringBuilder()

txt foreach { c =>
if (!isSpace(c)) sb append c
else if (sb.isEmpty || !isSpace(sb.last)) sb append ' '
}

val trimmed = sb.toString.trim
if (!trimmed.isEmpty) append(trimmed)
}
}
/** Appends `txt` to `ts` as a text tree positioned at `span`.
 *
 *  When `preserveWS` is set the text is used verbatim; otherwise every run of
 *  whitespace is collapsed to a single blank and the result is trimmed.
 *  Text that ends up empty is dropped: nothing is appended for it.
 */
def appendText(span: Span, ts: Buffer[Tree], txt: String): Unit =
  // Whitespace-normalised form of `txt`; only evaluated when whitespace is not preserved.
  def normalized: String =
    val out = StringBuilder()
    for c <- txt do
      if isSpace(c) then
        // emit one blank per whitespace run
        if out.isEmpty || !isSpace(out.last) then out += ' '
      else out += c
    out.toString.trim
  val cleaned = if preserveWS then txt else normalized
  if cleaned.nonEmpty then ts += handle.text(span, cleaned)

/** adds entity/character to ts as side-effect
* @precond ch == '&'
Expand Down Expand Up @@ -226,48 +225,74 @@ object MarkupParsers {
if (xCheckEmbeddedBlock) ts append xEmbeddedExpr
else appendText(p, ts, xText)

/** Returns true if it encounters an end tag (without consuming it),
* appends trees to ts as side-effect.
*
* @param ts ...
* @return ...
/** At an open angle-bracket, detects an end tag
* or consumes CDATA, comment, PI or element.
* Trees are appended to `ts` as a side-effect.
* @return true if an end tag was found (it is left unconsumed), false after a tree was appended
*/
private def content_LT(ts: ArrayBuffer[Tree]): Boolean = {
if (ch == '/')
return true // end tag

val toAppend = ch match {
case '!' => nextch() ; if (ch =='[') xCharData else xComment // CDATA or Comment
case '?' => nextch() ; xProcInstr // PI
case _ => element // child node
private def content_LT(ts: ArrayBuffer[Tree]): Boolean =
(ch == '/') || {
val toAppend = ch match
case '!' => nextch() ; if (ch =='[') xCharData else xComment // CDATA or Comment
case '?' => nextch() ; xProcInstr // PI
case _ => element // child node
ts += toAppend
false
}

ts append toAppend
false
}

def content: Buffer[Tree] = {
def content: Buffer[Tree] =
val ts = new ArrayBuffer[Tree]
while (true) {
if (xEmbeddedBlock)
ts append xEmbeddedExpr
else {
@tailrec def loopContent(): Unit =
if xEmbeddedBlock then
ts += xEmbeddedExpr
loopContent()
else
tmppos = Span(curOffset)
ch match {
// end tag, cdata, comment, pi or child node
case '<' => nextch() ; if (content_LT(ts)) return ts
// either the character '{' or an embedded scala block }
case '{' => content_BRACE(tmppos, ts) // }
// EntityRef or CharRef
case '&' => content_AMP(ts)
case SU => return ts
// text content - here xEmbeddedBlock might be true
case _ => appendText(tmppos, ts, xText)
}
}
}
unreachable
}
ch match
case '<' => // end tag, cdata, comment, pi or child node
nextch()
if !content_LT(ts) then loopContent()
case '{' => // literal brace or embedded Scala block
content_BRACE(tmppos, ts)
loopContent()
case '&' => // EntityRef or CharRef
content_AMP(ts)
loopContent()
case SU => ()
case _ => // text content - here xEmbeddedBlock might be true
appendText(tmppos, ts, xText)
loopContent()
end if
// merge text sections and strip attachments
def coalesce(): ArrayBuffer[Tree] =
def copy() =
val buf = ArrayBuffer.empty[Tree]
val acc = StringBuilder()
var pos: PositionType = NoSpan
def emit() =
if acc.nonEmpty then
appendText(pos, buf, acc.toString)
acc.clear()
for t <- ts do
t.getAttachment(TextAttacheKey) match {
case Some(ta) =>
if acc.isEmpty then pos = ta.span
acc append ta.text
case _ =>
emit()
buf += t
}
emit()
buf
end copy
// begin
val res = if ts.count(_.hasAttachment(TextAttacheKey)) > 1 then copy() else ts
for t <- res do t.removeAttachment(TextAttacheKey)
res
end coalesce
loopContent()
if isCoalescing then coalesce() else ts
end content

/** '<' element ::= xmlTag1 '>' { xmlExpr | '{' simpleExpr '}' } ETag
* | xmlTag1 '/' '>'
Expand Down Expand Up @@ -299,24 +324,19 @@ object MarkupParsers {
/** parse character data.
* precondition: xEmbeddedBlock == false (we are not in a scala block)
*/
private def xText: String = {
private def xText: String =
assert(!xEmbeddedBlock, "internal error: encountered embedded block")
val buf = new StringBuilder
def done = buf.toString

while (ch != SU) {
if (ch == '}') {
if (charComingAfter(nextch()) == '}') nextch()
else errorBraces()
}

buf append ch
nextch()
if (xCheckEmbeddedBlock || ch == '<' || ch == '&')
return done
}
done
}
val buf = StringBuilder()
if (ch != SU)
while
if ch == '}' then
if charComingAfter(nextch()) == '}' then nextch()
else errorBraces()
buf += ch
nextch()
!(ch == SU || xCheckEmbeddedBlock || ch == '<' || ch == '&')
do ()
buf.toString

/** Some try/catch/finally logic used by xLiteral and xLiteralPattern. */
inline private def xLiteralCommon(f: () => Tree, ifTruncated: String => Unit): Tree = {
Expand All @@ -329,9 +349,9 @@ object MarkupParsers {
case c @ TruncatedXMLControl =>
ifTruncated(c.getMessage)
case c @ (MissingEndTagControl | ConfusedAboutBracesControl) =>
parser.syntaxError(c.getMessage + debugLastElem + ">", debugLastPos)
parser.syntaxError(s"${c.getMessage}$debugLastElem>", debugLastPos)
case _: ArrayIndexOutOfBoundsException =>
parser.syntaxError("missing end tag in XML literal for <%s>" format debugLastElem, debugLastPos)
parser.syntaxError(s"missing end tag in XML literal for <$debugLastElem>", debugLastPos)
}
finally parser.in.resume(saved)

Expand All @@ -342,14 +362,13 @@ object MarkupParsers {
}

/** Use a lookahead parser to run speculative body, and return the first char afterward. */
private def charComingAfter(body: => Unit): Char = {
private def charComingAfter(body: => Unit): Char =
try {
input = input.lookaheadReader()
body
ch
}
finally input = parser.in
}

/** xLiteral = element { element }
* @return Scala representation of this xml literal
Expand All @@ -369,7 +388,7 @@ object MarkupParsers {
while {
xSpaceOpt()
nextch()
ts.append(element)
content_LT(ts)
charComingAfter(xSpaceOpt()) == '<'
} do ()
handle.makeXMLseq(Span(start, curOffset, start), ts)
Expand Down Expand Up @@ -431,7 +450,7 @@ object MarkupParsers {
* | Name [S] '/' '>'
*/
def xPattern: Tree = {
var start = curOffset
val start = curOffset
val qname = xName
debugLastStartElement = (start, qname) :: debugLastStartElement
xSpaceOpt()
Expand All @@ -453,11 +472,11 @@ object MarkupParsers {
if (ch != '/') ts append xPattern // child
else return false // terminate

case '{' => // embedded Scala patterns
while (ch == '{') {
nextch()
case '{' if xCheckEmbeddedBlock => // embedded Scala patterns, if not double brace
while
ts ++= xScalaPatterns
}
xCheckEmbeddedBlock
do ()
assert(!xEmbeddedBlock, "problem with embedded block")

case SU =>
Expand Down
14 changes: 11 additions & 3 deletions compiler/src/dotty/tools/dotc/parsing/xml/SymbolicXMLBuilder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import Flags.Mutable
import Names._, StdNames._, ast.Trees._, ast.{tpd, untpd}
import Symbols._, Contexts._
import util.Spans._
import util.Property
import Parsers.Parser

/** This class builds instance of `Tree` that represent XML.
Expand All @@ -25,12 +26,13 @@ import Parsers.Parser
* @author Burak Emir
* @version 1.0
*/
class SymbolicXMLBuilder(parser: Parser, preserveWS: Boolean)(using Context) {
class SymbolicXMLBuilder(parser: Parser, preserveWS: Boolean, isCoalescing: Boolean)(using Context) {

import Constants.Constant
import untpd._

import parser.atSpan
import SymbolicXMLBuilder.*

private[parsing] var isPattern: Boolean = _

Expand Down Expand Up @@ -115,8 +117,9 @@ class SymbolicXMLBuilder(parser: Parser, preserveWS: Boolean)(using Context) {

// create scala.xml.Text here <: scala.xml.Node
final def text(span: Span, txt: String): Tree = atSpan(span) {
if (isPattern) makeTextPat(const(txt))
else makeText1(const(txt))
val t = if isPattern then makeTextPat(const(txt)) else makeText1(const(txt))
if isCoalescing then t.putAttachment(TextAttacheKey, TextAttache(span, txt))
t
}

def makeTextPat(txt: Tree): Apply = Apply(_scala_xml__Text, List(txt))
Expand Down Expand Up @@ -259,3 +262,8 @@ class SymbolicXMLBuilder(parser: Parser, preserveWS: Boolean)(using Context) {
atSpan(span.toSynthetic)(new XMLBlock(nsResult, new XMLBlock(attrResult, body)))
}
}
/** Companion of the XML tree builder: holds the attachment type and key used to tag text trees. */
object SymbolicXMLBuilder {

  /** Attachment for trees deriving from text nodes (Text, CData, entities). Used for coalescing.
   *
   *  @param span the span recorded for the text
   *  @param text the raw text carried by the tree
   */
  case class TextAttache(span: Span, text: String)

  /** Property key under which a [[TextAttache]] is attached to a tree. */
  val TextAttacheKey: Property.Key[TextAttache] = Property.Key[TextAttache]()
}
7 changes: 7 additions & 0 deletions tests/neg/t2275a.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Negative test: the `<br>` element in the first branch has no closing tag.
// NOTE(review): the trailing diagnostic markers on the lines below look position-sensitive
// (they appear to be matched per line by the test harness) — keep each on its line; confirm.
object Test {
if (true) {
<br> // error maybe this tag isn't closed // error
}else{ // error // error in XML content, use double brace
<span>{"louenesee"}</span>
}
} // anypos-error
7 changes: 0 additions & 7 deletions tests/untried/neg/t2275a.scala

This file was deleted.