diff --git a/html5ever-2015-05-15/Cargo.toml b/html5ever-2015-05-15/Cargo.toml
deleted file mode 100644
index d5ddd9f51..000000000
--- a/html5ever-2015-05-15/Cargo.toml
+++ /dev/null
@@ -1,40 +0,0 @@
-[package]
-
-name = "html5ever"
-version = "0.0.0"
-authors = [ "The html5ever Project Developers" ]
-
-[lib]
-name = "html5ever"
-
-# https://github.com/rust-lang/cargo/issues/1512
-doctest = false
-
-[dependencies]
-time = "0"
-log = "0"
-phf = "0.7"
-phf_macros = "0.7"
-string_cache = "0.1.12"
-string_cache_plugin = { version = "0.1.7", optional = true }
-
-[dependencies.mac]
-git = "https://github.com/reem/rust-mac"
-
-[dependencies.html5ever_macros]
-path = "macros"
-
-[dev-dependencies]
-rustc-serialize = "0"
-
-[dev-dependencies.test_util]
-path = "test_util"
-
-[dev-dependencies.html5ever_dom_sink]
-path = "dom_sink"
-
-[profile.dev]
-debug = false
-
-[profile.test]
-debug = false
diff --git a/html5ever-2015-05-15/capi/Cargo.toml b/html5ever-2015-05-15/capi/Cargo.toml
deleted file mode 100644
index d665d7523..000000000
--- a/html5ever-2015-05-15/capi/Cargo.toml
+++ /dev/null
@@ -1,20 +0,0 @@
-[package]
-
-name = "html5ever_capi"
-version = "0.0.0"
-authors = [ "The html5ever Project Developers" ]
-
-[lib]
-name = "html5ever_capi"
-crate-type = ["staticlib"]
-
-[dependencies]
-libc = "0"
-
-[dependencies.html5ever]
-path = "../"
-
-[dependencies.string_cache]
-git = "https://github.com/servo/string-cache"
-[dependencies.string_cache_plugin]
-git = "https://github.com/servo/string-cache"
diff --git a/html5ever-2015-05-15/capi/include/html5ever.h b/html5ever-2015-05-15/capi/include/html5ever.h
deleted file mode 100644
index e6c674d1c..000000000
--- a/html5ever-2015-05-15/capi/include/html5ever.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-#ifndef __HTML5EVER_H
-#define __HTML5EVER_H
-
-#include <stddef.h>
-
-struct h5e_buf {
-    unsigned char *data;
-    size_t len;
-};
-
-struct h5e_buf h5e_buf_from_cstr(const char *str);
-
-struct h5e_token_ops {
-    void (*do_doctype)(void *user, struct h5e_buf name,
-        struct h5e_buf pub, struct h5e_buf sys, int force_quirks);
-    void (*do_start_tag)(void *user, struct h5e_buf name,
-        int self_closing, size_t num_attrs);
-    void (*do_tag_attr)(void *user, struct h5e_buf name, struct h5e_buf value);
-    void (*do_end_tag)(void *user, struct h5e_buf name);
-    void (*do_comment)(void *user, struct h5e_buf text);
-    void (*do_chars)(void *user, struct h5e_buf text);
-    void (*do_null_char)(void *user);
-    void (*do_eof)(void *user);
-    void (*do_error)(void *user, struct h5e_buf message);
-};
-
-struct h5e_token_sink {
-    struct h5e_token_ops *ops;
-    void *user;
-};
-
-struct h5e_tokenizer;
-
-struct h5e_tokenizer *h5e_tokenizer_new(struct h5e_token_sink *sink);
-void h5e_tokenizer_free(struct h5e_tokenizer *tok);
-void h5e_tokenizer_feed(struct h5e_tokenizer *tok, struct h5e_buf buf);
-void h5e_tokenizer_end(struct h5e_tokenizer *tok);
-
-#endif
diff --git a/html5ever-2015-05-15/capi/src/lib.rs b/html5ever-2015-05-15/capi/src/lib.rs
deleted file mode 100644
index 2f1dbc621..000000000
--- a/html5ever-2015-05-15/capi/src/lib.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright 2014-2015 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-extern crate libc;
-extern crate string_cache;
-extern crate html5ever;
-
-use std::{ptr, slice, str};
-use std::marker::PhantomData;
-use std::borrow::Cow;
-
-use libc::{size_t, c_int, c_char, strlen};
-
-use string_cache::Atom;
-
-#[repr(C)]
-pub struct h5e_buf {
-    data: *const u8,
-    len: size_t,
-}
-
-impl Copy for h5e_buf { }
-impl Clone for h5e_buf {
-    fn clone(&self) -> h5e_buf {
-        *self
-    }
-}
-
-impl h5e_buf {
-    pub fn null() -> h5e_buf {
-        h5e_buf {
-            data: ptr::null(),
-            len: 0,
-        }
-    }
-
-    pub unsafe fn as_slice(&self) -> &str {
-        str::from_utf8_unchecked(slice::from_raw_parts(self.data, self.len as usize))
-    }
-}
-
-pub struct LifetimeBuf<'a> {
-    buf: h5e_buf,
-    marker: PhantomData<&'a [u8]>,
-}
-
-impl<'a> LifetimeBuf<'a> {
-    pub fn from_str(x: &'a str) -> LifetimeBuf<'a> {
-        LifetimeBuf {
-            buf: h5e_buf {
-                data: x.as_bytes().as_ptr(),
-                len: x.len() as size_t,
-            },
-            marker: PhantomData,
-        }
-    }
-
-    pub fn null() -> LifetimeBuf<'a> {
-        LifetimeBuf {
-            buf: h5e_buf::null(),
-            marker: PhantomData,
-        }
-    }
-
-    #[inline]
-    pub fn get(self) -> h5e_buf {
-        self.buf
-    }
-}
-
-// Or we could just make `LifetimeBuf::from_str` generic over <T: Str>;
-// see rust-lang/rust#16738.
-pub trait AsLifetimeBuf {
-    fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a>;
-}
-
-impl AsLifetimeBuf for String {
-    fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
-        LifetimeBuf::from_str(self)
-    }
-}
-
-impl AsLifetimeBuf for Atom {
-    fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
-        LifetimeBuf::from_str(self)
-    }
-}
-
-impl<'b> AsLifetimeBuf for Cow<'b, str> {
-    fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
-        LifetimeBuf::from_str(self)
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn h5e_buf_from_cstr(s: *const c_char) -> h5e_buf {
-    h5e_buf {
-        data: s as *const u8,
-        len: strlen(s),
-    }
-}
-
-pub fn c_bool(x: bool) -> c_int {
-    match x {
-        false => 0,
-        true => 1,
-    }
-}
diff --git a/html5ever-2015-05-15/capi/src/tokenizer.rs b/html5ever-2015-05-15/capi/src/tokenizer.rs
deleted file mode 100644
index d1fe7559b..000000000
--- a/html5ever-2015-05-15/capi/src/tokenizer.rs
+++ /dev/null
@@ -1,155 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-#![allow(non_camel_case_types)]
-
-use {LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool};
-
-use html5ever::tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken};
-use html5ever::tokenizer::{CommentToken, CharacterTokens, NullCharacterToken};
-use html5ever::tokenizer::{TagToken, StartTag, EndTag, EOFToken, Tokenizer};
-
-use std::mem;
-use std::default::Default;
-
-use libc::{c_void, c_int, size_t};
-
-#[repr(C)]
-pub struct h5e_token_ops {
-    do_doctype: Option<extern "C" fn(user: *mut c_void, name: h5e_buf,
-        public: h5e_buf, system: h5e_buf, force_quirks: c_int)>,
-
-    do_start_tag: Option<extern "C" fn(user: *mut c_void, name: h5e_buf,
-        self_closing: c_int, num_attrs: size_t)>,
-
-    do_tag_attr: Option<extern "C" fn(user: *mut c_void, name: h5e_buf, value: h5e_buf)>,
-    do_end_tag: Option<extern "C" fn(user: *mut c_void, name: h5e_buf)>,
-    do_comment: Option<extern "C" fn(user: *mut c_void, text: h5e_buf)>,
-    do_chars: Option<extern "C" fn(user: *mut c_void, text: h5e_buf)>,
-    do_null_char: Option<extern "C" fn(user: *mut c_void)>,
-    do_eof: Option<extern "C" fn(user: *mut c_void)>,
-    do_error: Option<extern "C" fn(user: *mut c_void, message: h5e_buf)>,
-}
-
-impl Copy for h5e_token_ops { }
-impl Clone for h5e_token_ops {
-    fn clone(&self) -> h5e_token_ops {
-        *self
-    }
-}
-
-#[repr(C)]
-pub struct h5e_token_sink {
-    ops: *const h5e_token_ops,
-    user: *mut c_void,
-}
-
-impl Copy for h5e_token_sink { }
-impl Clone for h5e_token_sink {
-    fn clone(&self) -> h5e_token_sink {
-        *self
-    }
-}
-
-impl TokenSink for *mut h5e_token_sink {
-    fn process_token(&mut self, token: Token) {
-        macro_rules! call {
-            ($name:ident, $($arg:expr),*) => (
-                unsafe {
-                    match (*(**self).ops).$name {
-                        None => (),
-                        Some(f) => f((**self).user $(, $arg)*),
-                    }
-                }
-            );
-            ($name:ident) => (call!($name,)); // bleh
-        }
-
-        fn opt_str_to_buf<'a>(s: &'a Option<String>) -> LifetimeBuf<'a> {
-            match *s {
-                None => LifetimeBuf::null(),
-                Some(ref s) => s.as_lifetime_buf(),
-            }
-        }
-
-        match token {
-            DoctypeToken(Doctype { name, public_id, system_id, force_quirks }) => {
-                let name = opt_str_to_buf(&name);
-                let public_id = opt_str_to_buf(&public_id);
-                let system_id = opt_str_to_buf(&system_id);
-                call!(do_doctype, name.get(), public_id.get(), system_id.get(),
-                    c_bool(force_quirks));
-            }
-
-            TagToken(Tag { kind, name, self_closing, attrs }) => {
-                let name = name.as_lifetime_buf();
-                match kind {
-                    StartTag => {
-                        call!(do_start_tag, name.get(), c_bool(self_closing),
-                            attrs.len() as size_t);
-                        for attr in attrs.into_iter() {
-                            // All attribute names from the tokenizer are local.
-                            assert!(attr.name.ns == ns!(""));
-                            let name = attr.name.local.as_lifetime_buf();
-                            let value = attr.value.as_lifetime_buf();
-                            call!(do_tag_attr, name.get(), value.get());
-                        }
-                    }
-                    EndTag => call!(do_end_tag, name.get()),
-                }
-            }
-
-            CommentToken(text) => {
-                let text = text.as_lifetime_buf();
-                call!(do_comment, text.get());
-            }
-
-            CharacterTokens(text) => {
-                let text = text.as_lifetime_buf();
-                call!(do_chars, text.get());
-            }
-
-            NullCharacterToken => call!(do_null_char),
-
-            EOFToken => call!(do_eof),
-
-            ParseError(msg) => {
-                let msg = msg.as_lifetime_buf();
-                call!(do_error, msg.get());
-            }
-        }
-    }
-}
-
-pub type h5e_tokenizer_ptr = *const ();
-
-#[no_mangle]
-pub unsafe extern "C" fn h5e_tokenizer_new(sink: *mut h5e_token_sink) -> h5e_tokenizer_ptr {
-    let tok: Box<Tokenizer<*mut h5e_token_sink>>
-        = box Tokenizer::new(sink, Default::default());
-
-    mem::transmute(tok)
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn h5e_tokenizer_free(tok: h5e_tokenizer_ptr) {
-    let _: Box<Tokenizer<*mut h5e_token_sink>> = mem::transmute(tok);
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn h5e_tokenizer_feed(tok: h5e_tokenizer_ptr, buf: h5e_buf) {
-    let tok: &mut Tokenizer<*mut h5e_token_sink> = mem::transmute(tok);
-    tok.feed(String::from_str(buf.as_slice()));
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn h5e_tokenizer_end(tok: h5e_tokenizer_ptr) {
-    let tok: &mut Tokenizer<*mut h5e_token_sink> = mem::transmute(tok);
-    tok.end();
-}
diff --git a/html5ever-2015-05-15/data/test/ignore b/html5ever-2015-05-15/data/test/ignore
deleted file mode 100644
index 5f84915c6..000000000
--- a/html5ever-2015-05-15/data/test/ignore
+++ /dev/null
@@ -1,105 +0,0 @@
-tb: isindex.dat-0
-tb: isindex.dat-1
-tb: isindex.dat-2
-tb: ruby.dat-0
-tb: ruby.dat-1
-tb: ruby.dat-10
-tb: ruby.dat-12
-tb: ruby.dat-13
-tb: ruby.dat-15
-tb: ruby.dat-17
-tb: ruby.dat-2
-tb: ruby.dat-20
-tb: ruby.dat-3
-tb: ruby.dat-5
-tb: ruby.dat-7
-tb: tests16.dat-181
-tb: tests16.dat-183
-tb: tests16.dat-185
-tb: tests16.dat-194
-tb: tests16.dat-84
-tb: tests16.dat-86
-tb: tests16.dat-88
-tb: tests19.dat-10
-tb: tests19.dat-11
-tb: tests19.dat-18
-tb: tests19.dat-21
-tb: tests19.dat-7
-tb: tests19.dat-8
-tb: tests19.dat-9
-tb: tests2.dat-44
-tb: tests26.dat-9
-tb: tests5.dat-16
-tb: webkit02.dat-2
-tb: foreign-fragment.dat-0
-tb: foreign-fragment.dat-1
-tb: foreign-fragment.dat-18
-tb: foreign-fragment.dat-19
-tb: foreign-fragment.dat-2
-tb: foreign-fragment.dat-22
-tb: foreign-fragment.dat-23
-tb: foreign-fragment.dat-26
-tb: foreign-fragment.dat-27
-tb: foreign-fragment.dat-3
-tb: foreign-fragment.dat-30
-tb: foreign-fragment.dat-31
-tb: foreign-fragment.dat-34
-tb: foreign-fragment.dat-35
-tb: foreign-fragment.dat-38
-tb: foreign-fragment.dat-39
-tb: foreign-fragment.dat-40
-tb: foreign-fragment.dat-41
-tb: foreign-fragment.dat-47
-tb: foreign-fragment.dat-48
-tb: domjs-unsafe.dat-0
-tb: domjs-unsafe.dat-1
-tb: domjs-unsafe.dat-2
-tb: domjs-unsafe.dat-46
-tb: domjs-unsafe.dat-47
-tb: plain-text-unsafe.dat-10
-tb: plain-text-unsafe.dat-13
-tb: plain-text-unsafe.dat-26
-tb: plain-text-unsafe.dat-27
-tb: plain-text-unsafe.dat-28
-tb: plain-text-unsafe.dat-29
-tb: plain-text-unsafe.dat-30
-tb: plain-text-unsafe.dat-31
-tb: plain-text-unsafe.dat-32
-tb: tests10.dat-30
-tb: tests10.dat-31
-tb: tests10.dat-34
-tb: tests10.dat-35
-tb: tests12.dat-0
-tb: tests19.dat-35
-tb: tests19.dat-36
-tb: tests19.dat-37
-tb: tests19.dat-38
-tb: tests19.dat-39
-tb: tests19.dat-87
-tb: tests20.dat-34
-tb: tests20.dat-35
-tb: tests20.dat-36
-tb: tests20.dat-37
-tb: tests21.dat-0
-tb: tests21.dat-1
-tb: tests21.dat-10
-tb: tests21.dat-11
-tb: tests21.dat-12
-tb: tests21.dat-13
-tb: tests21.dat-14
-tb: tests21.dat-16
-tb: tests21.dat-17
-tb: tests21.dat-18
-tb: tests21.dat-19
-tb: tests21.dat-20
-tb: tests21.dat-21
-tb: tests21.dat-22
-tb: tests21.dat-23
-tb: tests21.dat-24
-tb: tests21.dat-3
-tb: tests21.dat-4
-tb: tests21.dat-5
-tb: tests21.dat-6
-tb: tests21.dat-7
-tb: tests21.dat-8
-tb: tests21.dat-9
diff --git a/html5ever-2015-05-15/dom_sink/Cargo.toml b/html5ever-2015-05-15/dom_sink/Cargo.toml
deleted file mode 100644
index 10749c51f..000000000
--- a/html5ever-2015-05-15/dom_sink/Cargo.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-[package]
-name = "html5ever_dom_sink"
-version = "0.0.0"
-authors = [ "The html5ever Project Developers" ]
-
-[lib]
-name = "html5ever_dom_sink"
-
-[dependencies.html5ever]
-path = "../"
-
-[dependencies.mac]
-git = "https://github.com/reem/rust-mac"
-
-[dependencies.string_cache]
-git = "https://github.com/servo/string-cache"
diff --git a/html5ever-2015-05-15/dom_sink/src/common.rs b/html5ever-2015-05-15/dom_sink/src/common.rs
deleted file mode 100644
index 7fb1c6fec..000000000
--- a/html5ever-2015-05-15/dom_sink/src/common.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-use html5ever::tokenizer::Attribute;
-
-use string_cache::QualName;
-
-pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element};
-
-/// The different kinds of nodes in the DOM.
-#[derive(Debug)]
-pub enum NodeEnum {
-    /// The `Document` itself.
-    Document,
-
-    /// A `DOCTYPE` with name, public id, and system id.
-    Doctype(String, String, String),
-
-    /// A text node.
-    Text(String),
-
-    /// A comment.
-    Comment(String),
-
-    /// An element with attributes.
-    Element(QualName, Vec<Attribute>),
-}
diff --git a/html5ever-2015-05-15/dom_sink/src/lib.rs b/html5ever-2015-05-15/dom_sink/src/lib.rs
deleted file mode 100644
index 8194715f7..000000000
--- a/html5ever-2015-05-15/dom_sink/src/lib.rs
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-#![crate_name="html5ever_dom_sink"]
-#![crate_type="dylib"]
-
-#![feature(alloc, box_syntax, collections)]
-
-extern crate html5ever;
-
-#[macro_use]
-extern crate string_cache;
-
-#[macro_use]
-extern crate mac;
-
-pub mod common;
-pub mod rcdom;
-pub mod owned_dom;
diff --git a/html5ever-2015-05-15/dom_sink/src/owned_dom.rs b/html5ever-2015-05-15/dom_sink/src/owned_dom.rs
deleted file mode 100644
index 9b4781658..000000000
--- a/html5ever-2015-05-15/dom_sink/src/owned_dom.rs
+++ /dev/null
@@ -1,388 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-//! A simple DOM where every node is owned by its parent.
-//!
-//! Since ownership is more complicated during parsing, we actually
-//! build a different type and then transmute to the public `Node`.
-//! This is believed to be memory safe, but if you want to be extra
-//! careful you can use `RcDom` instead.
-//!
-//! **Warning: Unstable.** This module uses unsafe code, has not
-//! been thoroughly audited, and the performance gains vs. RcDom
-//! have not been demonstrated.
-
-use common::{NodeEnum, Document, Doctype, Text, Comment, Element};
-
-use html5ever::tokenizer::Attribute;
-use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
-use html5ever::tree_builder;
-use html5ever::serialize::{Serializable, Serializer};
-use html5ever::serialize::TraversalScope;
-use html5ever::serialize::TraversalScope::{IncludeNode, ChildrenOnly};
-use html5ever::driver::ParseResult;
-
-use std::{mem, ptr};
-use std::cell::UnsafeCell;
-use std::default::Default;
-use std::mem::transmute;
-use std::borrow::Cow;
-use std::io::{self, Write};
-use std::collections::HashSet;
-use std::ops::{Deref, DerefMut};
-
-use string_cache::QualName;
-
-/// The internal type we use for nodes during parsing.
-pub struct SquishyNode {
-    node: NodeEnum,
-    parent: Handle,
-    children: Vec<Handle>,
-}
-
-impl SquishyNode {
-    fn new(node: NodeEnum) -> SquishyNode {
-        SquishyNode {
-            node: node,
-            parent: Handle::null(),
-            children: vec!(),
-        }
-    }
-}
-
-pub struct Handle {
-    ptr: *const UnsafeCell<SquishyNode>,
-}
-
-impl Handle {
-    fn new(ptr: *const UnsafeCell<SquishyNode>) -> Handle {
-        Handle {
-            ptr: ptr,
-        }
-    }
-
-    fn null() -> Handle {
-        Handle::new(ptr::null())
-    }
-
-    fn is_null(&self) -> bool {
-        self.ptr.is_null()
-    }
-}
-
-impl PartialEq for Handle {
-    fn eq(&self, other: &Handle) -> bool {
-        self.ptr == other.ptr
-    }
-}
-
-impl Eq for Handle { }
-
-impl Clone for Handle {
-    fn clone(&self) -> Handle {
-        Handle::new(self.ptr)
-    }
-}
-
-impl Copy for Handle { }
-
-// The safety of `Deref` and `DerefMut` depends on the invariant that `Handle`s
-// can't escape the `Sink`, because nodes are deallocated by consuming the
-// `Sink`.
-
-impl DerefMut for Handle {
-    fn deref_mut<'a>(&'a mut self) -> &'a mut SquishyNode {
-        unsafe {
-            transmute::<_, &'a mut SquishyNode>((*self.ptr).get())
-        }
-    }
-}
-
-impl Deref for Handle {
-    type Target = SquishyNode;
-    fn deref<'a>(&'a self) -> &'a SquishyNode {
-        unsafe {
-            transmute::<_, &'a SquishyNode>((*self.ptr).get())
-        }
-    }
-}
-
-fn append(mut new_parent: Handle, mut child: Handle) {
-    new_parent.children.push(child);
-    let parent = &mut child.parent;
-    assert!(parent.is_null());
-    *parent = new_parent
-}
-
-fn get_parent_and_index(child: Handle) -> Option<(Handle, usize)> {
-    if child.parent.is_null() {
-        return None;
-    }
-
-    let to_find = child;
-    match child.parent.children.iter().enumerate().find(|&(_, n)| *n == to_find) {
-        Some((i, _)) => Some((child.parent, i)),
-        None => panic!("have parent but couldn't find in parent's children!"),
-    }
-}
-
-fn append_to_existing_text(mut prev: Handle, text: &str) -> bool {
-    match prev.deref_mut().node {
-        Text(ref mut existing) => {
-            existing.push_str(text);
-            true
-        }
-        _ => false,
-    }
-}
-
-pub struct Sink {
-    nodes: Vec<Box<UnsafeCell<SquishyNode>>>,
-    document: Handle,
-    errors: Vec<Cow<'static, str>>,
-    quirks_mode: QuirksMode,
-}
-
-impl Default for Sink {
-    fn default() -> Sink {
-        let mut sink = Sink {
-            nodes: vec!(),
-            document: Handle::null(),
-            errors: vec!(),
-            quirks_mode: tree_builder::NoQuirks,
-        };
-        sink.document = sink.new_node(Document);
-        sink
-    }
-}
-
-impl Sink {
-    fn new_node(&mut self, node: NodeEnum) -> Handle {
-        self.nodes.push(box UnsafeCell::new(SquishyNode::new(node)));
-        let ptr: *const UnsafeCell<SquishyNode> = &**self.nodes.last().unwrap();
-        Handle::new(ptr)
-    }
-
-    // FIXME(rust-lang/rust#18296): This is separate from remove_from_parent so
-    // we can call it.
-    fn unparent(&mut self, mut target: Handle) {
-        let (mut parent, i) = unwrap_or_return!(get_parent_and_index(target), ());
-        parent.children.remove(i);
-        target.parent = Handle::null();
-    }
-}
-
-impl TreeSink for Sink {
-    type Handle = Handle;
-
-    fn parse_error(&mut self, msg: Cow<'static, str>) {
-        self.errors.push(msg);
-    }
-
-    fn get_document(&mut self) -> Handle {
-        self.document
-    }
-
-    fn set_quirks_mode(&mut self, mode: QuirksMode) {
-        self.quirks_mode = mode;
-    }
-
-    fn same_node(&self, x: Handle, y: Handle) -> bool {
-        x == y
-    }
-
-    fn elem_name(&self, target: Handle) -> QualName {
-        match target.node {
-            Element(ref name, _) => name.clone(),
-            _ => panic!("not an element!"),
-        }
-    }
-
-    fn create_element(&mut self, name: QualName, attrs: Vec<Attribute>) -> Handle {
-        self.new_node(Element(name, attrs))
-    }
-
-    fn create_comment(&mut self, text: String) -> Handle {
-        self.new_node(Comment(text))
-    }
-
-    fn append(&mut self, parent: Handle, child: NodeOrText<Handle>) {
-        // Append to an existing Text node if we have one.
-        match child {
-            AppendText(ref text) => match parent.children.last() {
-                Some(h) => if append_to_existing_text(*h, &text) { return; },
-                _ => (),
-            },
-            _ => (),
-        }
-
-        append(parent, match child {
-            AppendText(text) => self.new_node(Text(text)),
-            AppendNode(node) => node
-        });
-    }
-
-    fn append_before_sibling(&mut self,
-            sibling: Handle,
-            child: NodeOrText<Handle>) -> Result<(), NodeOrText<Handle>> {
-        let (mut parent, i) = unwrap_or_return!(get_parent_and_index(sibling), Err(child));
-
-        let mut child = match (child, i) {
-            // No previous node.
-            (AppendText(text), 0) => self.new_node(Text(text)),
-
-            // Look for a text node before the insertion point.
-            (AppendText(text), i) => {
-                let prev = parent.children[i-1];
-                if append_to_existing_text(prev, &text) {
-                    return Ok(());
-                }
-                self.new_node(Text(text))
-            }
-
-            // The tree builder promises we won't have a text node after
-            // the insertion point.
-
-            // Any other kind of node.
-            (AppendNode(node), _) => node,
-        };
-
-        if !child.parent.is_null() {
-            self.unparent(child);
-        }
-
-        child.parent = parent;
-        parent.children.insert(i, child);
-        Ok(())
-    }
-
-    fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
-        append(self.document, self.new_node(Doctype(name, public_id, system_id)));
-    }
-
-    fn add_attrs_if_missing(&mut self, mut target: Handle, mut attrs: Vec<Attribute>) {
-        let existing = match target.deref_mut().node {
-            Element(_, ref mut attrs) => attrs,
-            _ => return,
-        };
-
-        // FIXME: quadratic time
-        attrs.retain(|attr|
-            !existing.iter().any(|e| e.name == attr.name));
-        existing.extend(attrs.into_iter());
-    }
-
-    fn remove_from_parent(&mut self, target: Handle) {
-        self.unparent(target);
-    }
-
-    fn reparent_children(&mut self, mut node: Handle, mut new_parent: Handle) {
-        new_parent.children.append(&mut node.children);
-    }
-
-    fn mark_script_already_started(&mut self, _node: Handle) { }
-}
-
-pub struct Node {
-    pub node: NodeEnum,
-    _parent_not_accessible: usize,
-    pub children: Vec<Box<Node>>,
-}
-
-pub struct OwnedDom {
-    pub document: Box<Node>,
-    pub errors: Vec<Cow<'static, str>>,
-    pub quirks_mode: QuirksMode,
-}
-
-impl ParseResult for OwnedDom {
-    type Sink = Sink;
-
-    fn get_result(sink: Sink) -> OwnedDom {
-        fn walk(live: &mut HashSet<usize>, node: Handle) {
-            live.insert(node.ptr as usize);
-            for &child in node.deref().children.iter() {
-                walk(live, child);
-            }
-        }
-
-        // Collect addresses of all the nodes that made it into the final tree.
-        let mut live = HashSet::new();
-        walk(&mut live, sink.document);
-
-        // Forget about the nodes in the final tree; they will be owned by
-        // their parent. In the process of iterating we drop all nodes that
-        // aren't in the tree.
-        for node in sink.nodes.into_iter() {
-            let ptr: *const UnsafeCell<SquishyNode> = &*node;
-            if live.contains(&(ptr as usize)) {
-                unsafe {
-                    mem::forget(node);
-                }
-            }
-        }
-
-        let old_addrs = addrs_of!(sink.document => node, parent, children);
-
-        // Transmute the root to a Node, finalizing the transfer of ownership.
-        let document = unsafe {
-            mem::transmute::<*const UnsafeCell<SquishyNode>, Box<Node>>(sink.document.ptr)
-        };
-
-        // FIXME: do this assertion statically
-        let new_addrs = addrs_of!(document => node, _parent_not_accessible, children);
-        assert_eq!(old_addrs, new_addrs);
-
-        OwnedDom {
-            document: document,
-            errors: sink.errors,
-            quirks_mode: sink.quirks_mode,
-        }
-    }
-}
-
-impl Serializable for Node {
-    fn serialize<'wr, Wr: Write>(&self,
-            serializer: &mut Serializer<'wr, Wr>,
-            traversal_scope: TraversalScope) -> io::Result<()> {
-
-        match (traversal_scope, &self.node) {
-            (_, &Element(ref name, ref attrs)) => {
-                if traversal_scope == IncludeNode {
-                    try!(serializer.start_elem(name.clone(),
-                        attrs.iter().map(|at| (&at.name, &at.value[..]))));
-                }
-
-                for child in self.children.iter() {
-                    try!(child.serialize(serializer, IncludeNode));
-                }
-
-                if traversal_scope == IncludeNode {
-                    try!(serializer.end_elem(name.clone()));
-                }
-                Ok(())
-            }
-
-            (ChildrenOnly, &Document) => {
-                for child in self.children.iter() {
-                    try!(child.serialize(serializer, IncludeNode));
-                }
-                Ok(())
-            }
-
-            (ChildrenOnly, _) => Ok(()),
-
-            (IncludeNode, &Doctype(ref name, _, _)) => serializer.write_doctype(&name),
-            (IncludeNode, &Text(ref text)) => serializer.write_text(&text),
-            (IncludeNode, &Comment(ref text)) => serializer.write_comment(&text),
-
-            (IncludeNode, &Document) => panic!("Can't serialize Document node itself"),
-        }
-    }
-}
diff --git a/html5ever-2015-05-15/macros/Cargo.toml b/html5ever-2015-05-15/macros/Cargo.toml
deleted file mode 100644
index 9bcecfbde..000000000
--- a/html5ever-2015-05-15/macros/Cargo.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-[package]
-
-name = "html5ever_macros"
-version = "0.0.0"
-authors = [ "The html5ever Project Developers" ]
-
-[lib]
-
-name = "html5ever_macros"
-plugin = true
-
-[dependencies]
-rustc-serialize = "0"
-
-[dependencies.mac]
-git = "https://github.com/reem/rust-mac"
diff --git a/html5ever-2015-05-15/macros/src/named_entities.rs b/html5ever-2015-05-15/macros/src/named_entities.rs
deleted file mode 100644
index f31924d6d..000000000
--- a/html5ever-2015-05-15/macros/src/named_entities.rs
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-#![allow(unused_imports)] // for quotes
-
-use std::path::PathBuf;
-use std::fs;
-use std::str::FromStr;
-use std::collections::HashMap;
-use std::convert::From;
-
-use rustc_serialize::json;
-use rustc_serialize::json::Json;
-use rustc_serialize::Decodable;
-use syntax::codemap::Span;
-use syntax::ast::{Path, ExprLit, Lit_, TokenTree};
-use syntax::parse::token;
-use syntax::ext::base::{ExtCtxt, MacResult, MacEager};
-use syntax::ext::source_util::expand_file;
-
-// A struct matching the entries in entities.json.
-// Simplifies JSON parsing because we can use Decodable.
-#[derive(RustcDecodable)]
-struct CharRef {
-    codepoints: Vec<u32>,
-    //characters: String, // Present in the file but we don't need it
-}
-
-// Build the map from entity names (and their prefixes) to characters.
-fn build_map(js: Json) -> Option<HashMap<String, [u32; 2]>> {
-    let mut map = HashMap::new();
-    let json_map = match js {
-        Json::Object(m) => m,
-        _ => return None,
-    };
-
-    // Add every named entity to the map.
-    for (k,v) in json_map.into_iter() {
-        let mut decoder = json::Decoder::new(v);
-        let CharRef { codepoints }: CharRef
-            = Decodable::decode(&mut decoder).ok().expect("bad CharRef");
-
-        assert!((codepoints.len() >= 1) && (codepoints.len() <= 2));
-        let mut codepoint_pair = [0, 0];
-        for (i,n) in codepoints.into_iter().enumerate() {
-            codepoint_pair[i] = n;
-        }
-
-        // Slice off the initial '&'
-        assert!(k.chars().next() == Some('&'));
-        map.insert(k[1..].to_string(), codepoint_pair);
-    }
-
-    // Add every missing prefix of those keys, mapping to NULL characters.
-    map.insert("".to_string(), [0, 0]);
-    let keys: Vec<String> = map.keys().map(|k| k.to_string()).collect();
-    for k in keys.into_iter() {
-        for n in 1 .. k.len() {
-            let pfx = k[..n].to_string();
-            if !map.contains_key(&pfx) {
-                map.insert(pfx, [0, 0]);
-            }
-        }
-    }
-
-    Some(map)
-}
-
-// Expand named_entities!("path/to/entities.json") into an invocation of phf_map!().
-pub fn expand(cx: &mut ExtCtxt, sp: Span, tt: &[TokenTree]) -> Box<MacResult + 'static> {
-    let usage = "Usage: named_entities!(\"path/to/entities.json\")";
-
-    // Argument to the macro should be a single literal string: a path to
-    // entities.json, relative to the file containing the macro invocation.
-    let json_filename = match tt {
-        [TokenTree::Token(_, token::Literal(token::Lit::Str_(s), _))] => s.as_str().to_string(),
-        _ => ext_bail!(cx, sp, usage),
-    };
-
-    // Get the result of calling file!() in the same place as our macro.
-    let mod_filename = ext_expect!(cx, sp, match expand_file(cx, sp, &[]).make_expr() {
-        Some(e) => match e.node {
-            ExprLit(ref s) => match s.node {
-                Lit_::LitStr(ref s, _) => Some(s.to_string()),
-                _ => None,
-            },
-            _ => None,
-        },
-        _ => None,
-    }, "unexpected result from file!()");
-
-    // Combine those to get an absolute path to entities.json.
-    let mut path: PathBuf = From::from(&mod_filename);
-    path.pop();
-    path.push(&json_filename);
-
-    // Open the JSON file, parse it, and build the map from names to characters.
-    let mut json_file = ext_expect!(cx, sp, fs::File::open(&path).ok(),
-        "can't open JSON file");
-    let js = ext_expect!(cx, sp, Json::from_reader(&mut json_file).ok(),
-        "can't parse JSON file");
-    let map = ext_expect!(cx, sp, build_map(js),
-        "JSON file does not match entities.json format");
-
-    // Emit a macro invocation of the form
-    //
-    //     phf_map!(k => v, k => v, ...)
-    let toks: Vec<_> = map.into_iter().flat_map(|(k, [c0, c1])| {
-        let k = &k[..];
-        (quote_tokens!(&mut *cx, $k => [$c0, $c1],)).into_iter()
-    }).collect();
-    MacEager::expr(quote_expr!(&mut *cx, phf_map!($toks)))
-}
diff --git a/html5ever-2015-05-15/src/driver.rs b/html5ever-2015-05-15/src/driver.rs
deleted file mode 100644
index 6fb359ded..000000000
--- a/html5ever-2015-05-15/src/driver.rs
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-//! High-level interface to the parser.
-
-use tokenizer::{TokenizerOpts, Tokenizer, TokenSink};
-use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};
-
-use std::option;
-use std::default::Default;
-
-use string_cache::{Atom, QualName};
-
-/// Convenience function to turn a single `String` into an iterator.
-pub fn one_input(x: String) -> option::IntoIter<String> {
-    Some(x).into_iter()
-}
-
-/// Tokenize and send results to a `TokenSink`.
-///
-/// ## Example
-///
-/// ```ignore
-/// let mut sink = MySink;
-/// tokenize_to(&mut sink, one_input(my_str), Default::default());
-/// ```
-pub fn tokenize_to<
-        Sink: TokenSink,
-        It: Iterator<Item=String>
-    >(
-        sink: Sink,
-        input: It,
-        opts: TokenizerOpts) -> Sink {
-
-    let mut tok = Tokenizer::new(sink, opts);
-    for s in input {
-        tok.feed(s);
-    }
-    tok.end();
-    tok.unwrap()
-}
-
-/// All-encompassing options struct for the parser.
-#[derive(Clone, Default)]
-pub struct ParseOpts {
-    /// Tokenizer options.
-    pub tokenizer: TokenizerOpts,
-
-    /// Tree builder options.
-    pub tree_builder: TreeBuilderOpts,
-}
-
-/// Parse and send results to a `TreeSink`.
-///
-/// ## Example
-///
-/// ```ignore
-/// let mut sink = MySink;
-/// parse_to(&mut sink, one_input(my_str), Default::default());
-/// ```
-pub fn parse_to<
-        Sink: TreeSink,
-        It: Iterator<Item=String>
-    >(
-        sink: Sink,
-        input: It,
-        opts: ParseOpts) -> Sink {
-
-    let tb = TreeBuilder::new(sink, opts.tree_builder);
-    let mut tok = Tokenizer::new(tb, opts.tokenizer);
-    for s in input {
-        tok.feed(s);
-    }
-    tok.end();
-    tok.unwrap().unwrap()
-}
-
-/// Parse an HTML fragment and send results to a `TreeSink`.
-///
-/// ## Example
-///
-/// ```ignore
-/// let mut sink = MySink;
-/// parse_fragment_to(&mut sink, one_input(my_str), context_token, Default::default());
-/// ```
-pub fn parse_fragment_to<
-        Sink: TreeSink,
-        It: Iterator<Item=String>
-    >(
-        sink: Sink,
-        input: It,
-        context: Atom,
-        opts: ParseOpts) -> Sink {
-
-    let mut sink = sink;
-    let context_elem = sink.create_element(QualName::new(ns!(HTML), context), vec!());
-    let tb = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder);
-    let tok_opts = TokenizerOpts {
-        initial_state: Some(tb.tokenizer_state_for_context_elem()),
-        .. opts.tokenizer
-    };
-    let mut tok = Tokenizer::new(tb, tok_opts);
-    for s in input {
-        tok.feed(s);
-    }
-    tok.end();
-    tok.unwrap().unwrap()
-}
-
-/// Results which can be extracted from a `TreeSink`.
-///
-/// Implement this for your parse tree data type so that it
-/// can be returned by `parse()`.
-pub trait ParseResult {
-    type Sink: TreeSink + Default;
-    fn get_result(sink: Self::Sink) -> Self;
-}
-
-/// Parse into a type which implements `ParseResult`.
-///
-/// ## Example
-///
-/// ```ignore
-/// let dom: RcDom = parse(one_input(my_str), Default::default());
-/// ```
-pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
-    where Output: ParseResult,
-          It: Iterator<Item=String>,
-{
-    let sink = parse_to(Default::default(), input, opts);
-    ParseResult::get_result(sink)
-}
-
-/// Parse an HTML fragment into a type which implements `ParseResult`.
-///
-/// ## Example
-///
-/// ```ignore
-/// let dom: RcDom = parse_fragment(one_input(my_str), context_token, Default::default());
-/// ```
-pub fn parse_fragment<Output, It>(input: It, context: Atom, opts: ParseOpts) -> Output
-    where Output: ParseResult,
-          It: Iterator<Item=String>,
-{
-    let sink = parse_fragment_to(Default::default(), input, context, opts);
-    ParseResult::get_result(sink)
-}
diff --git a/html5ever-2015-05-15/src/tree_builder/tag_sets.rs b/html5ever-2015-05-15/src/tree_builder/tag_sets.rs
deleted file mode 100644
index 4ee786708..000000000
--- a/html5ever-2015-05-15/src/tree_builder/tag_sets.rs
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-//! Various sets of HTML tag names, and macros for declaring them.
-
-use string_cache::QualName;
-
-macro_rules! declare_tag_set_impl ( ($param:ident, $b:ident, $supr:ident, $($tag:tt)+) => (
-    match $param {
-        $( qualname!(HTML, $tag) => $b, )+
-        p => $supr(p),
-    }
-));
-
-macro_rules! declare_tag_set_body (
-    ($param:ident = $supr:ident - $($tag:tt)+)
-        => ( declare_tag_set_impl!($param, false, $supr, $($tag)+) );
-
-    ($param:ident = $supr:ident + $($tag:tt)+)
-        => ( declare_tag_set_impl!($param, true, $supr, $($tag)+) );
-
-    ($param:ident = $($tag:tt)+)
-        => ( declare_tag_set_impl!($param, true, empty_set, $($tag)+) );
-);
-
-macro_rules! declare_tag_set (
-    (pub $name:ident = $($toks:tt)+) => (
-        pub fn $name(p: ::string_cache::QualName) -> bool {
-            declare_tag_set_body!(p = $($toks)+)
-        }
-    );
-
-    ($name:ident = $($toks:tt)+) => (
-        fn $name(p: ::string_cache::QualName) -> bool {
-            declare_tag_set_body!(p = $($toks)+)
-        }
-    );
-);
-
-#[inline(always)] pub fn empty_set(_: QualName) -> bool { false }
-#[inline(always)] pub fn full_set(_: QualName) -> bool { true }
-
-declare_tag_set!(pub default_scope = applet caption html table td th marquee object template);
-
-declare_tag_set!(pub list_item_scope = default_scope + ol ul);
-declare_tag_set!(pub button_scope = default_scope + button);
-declare_tag_set!(pub table_scope = html table template);
-declare_tag_set!(pub select_scope = full_set - optgroup option);
-
-declare_tag_set!(pub table_body_context = tbody tfoot thead template html);
-declare_tag_set!(pub table_row_context = tr template html);
-declare_tag_set!(pub td_th = td th);
-
-declare_tag_set!(pub cursory_implied_end = dd dt li option optgroup p rp rt);
-
-declare_tag_set!(pub thorough_implied_end = cursory_implied_end
-    + caption colgroup tbody td tfoot th thead tr);
-
-declare_tag_set!(pub heading_tag = h1 h2 h3 h4 h5 h6);
-
-declare_tag_set!(pub special_tag =
-    address applet area article aside base basefont bgsound blockquote body br button caption
-    center col colgroup dd details dir div dl dt embed fieldset figcaption figure footer form
-    frame frameset h1 h2 h3 h4 h5 h6 head header hgroup hr html iframe img input isindex li
-    link listing main marquee menu menuitem meta nav noembed noframes noscript object ol p
-    param plaintext pre script section select source style summary table tbody td template
-    textarea tfoot th thead title tr track ul wbr xmp);
-//§ END
-
-pub fn mathml_text_integration_point(p: QualName) -> bool {
-    matches!(p, qualname!(MathML, mi) | qualname!(MathML, mo) | qualname!(MathML, mn)
-        | qualname!(MathML, ms) | qualname!(MathML, mtext))
-}
-
-pub fn html_integration_point(p: QualName) -> bool {
-    // FIXME(#119): HTML integration points in MathML
-    matches!(p, qualname!(SVG, foreignObject) | qualname!(SVG, desc)
-        | qualname!(SVG, title))
-}
diff --git a/html5ever-2015-05-15/src/util/str.rs b/html5ever-2015-05-15/src/util/str.rs
deleted file mode 100644
index 70a5c19bf..000000000
--- a/html5ever-2015-05-15/src/util/str.rs
+++ /dev/null
@@ -1,240 +0,0 @@
-// Copyright 2014 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-use std::fmt;
-
-pub fn to_escaped_string<T: fmt::Debug>(x: &T) -> String {
-    use std::fmt::Write;
-
-    // FIXME: don't allocate twice
-    let mut buf = String::new();
-    let _ = buf.write_fmt(format_args!("{:?}", x));
-    buf.shrink_to_fit();
-    buf.escape_default()
-}
-
-// FIXME: The ASCII stuff is largely copied from std::ascii
-// (see rust-lang/rust#16801).
-
-pub static ASCII_LOWER_MAP: [u8; 256] = [
-    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
-    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
-    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
-    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
-    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
-    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
-    b'@',
-
-          b'a', b'b', b'c', b'd', b'e', b'f', b'g',
-    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
-    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
-    b'x', b'y', b'z',
-
-                      b'[', b'\\', b']', b'^', b'_',
-    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
-    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
-    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
-    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
-    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
-    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
-    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
-    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
-    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
-    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
-    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
-    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
-    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
-    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
-    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
-    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
-    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
-    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
-];
-
-#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
-pub struct Ascii {
-    chr: u8,
-}
-
-impl Ascii {
-    pub fn to_char(self) -> char {
-        self.chr as char
-    }
-
-    #[inline]
-    pub fn is_alphabetic(&self) -> bool {
-        (self.chr >= 0x41 && self.chr <= 0x5A) || (self.chr >= 0x61 && self.chr <= 0x7A)
-    }
-
-    #[inline]
-    pub fn is_digit(&self) -> bool {
-        self.chr >= 0x30 && self.chr <= 0x39
-    }
-
-    #[inline]
-    pub fn is_alphanumeric(&self) -> bool {
-        self.is_alphabetic() || self.is_digit()
-    }
-
-    #[inline]
-    pub fn to_lowercase(self) -> Ascii {
-        Ascii { chr: ASCII_LOWER_MAP[self.chr as usize] }
-    }
-
-    #[inline]
-    pub fn eq_ignore_case(self, other: Ascii) -> bool {
-        ASCII_LOWER_MAP[self.chr as usize] == ASCII_LOWER_MAP[other.chr as usize]
-    }
-}
-
-pub trait AsciiCast {
-    fn to_ascii_opt(&self) -> Option<Ascii>;
-}
-
-impl AsciiCast for char {
-    fn to_ascii_opt(&self) -> Option<Ascii> {
-        let n = *self as u32;
-        if n < 0x80 {
-            Some(Ascii { chr: n as u8 })
-        } else {
-            None
-        }
-    }
-}
-
-pub trait AsciiExt<T> {
-    fn to_ascii_lower(&self) -> T;
-    fn eq_ignore_ascii_case(&self, other: Self) -> bool;
-}
-
-impl<'a> AsciiExt<Vec<u8>> for &'a [u8] {
-    #[inline]
-    fn to_ascii_lower(&self) -> Vec<u8> {
-        self.iter().map(|&byte| ASCII_LOWER_MAP[byte as usize]).collect()
-    }
-
-    #[inline]
-    fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
-        self.len() == other.len() && self.iter().zip(other.iter()).all(
-            |(byte_self, byte_other)| {
-                ASCII_LOWER_MAP[*byte_self as usize] ==
-                    ASCII_LOWER_MAP[*byte_other as usize]
-            }
-        )
-    }
-}
-
-impl<'a> AsciiExt<String> for &'a str {
-    #[inline]
-    fn to_ascii_lower(&self) -> String {
-        // Vec<u8>::to_ascii_lower() preserves the UTF-8 invariant.
-        unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lower()) }
-    }
-
-    #[inline]
-    fn eq_ignore_ascii_case(&self, other: &str) -> bool {
-        self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
-    }
-}
-
-/// If `c` is an ASCII letter, return the corresponding lowercase
-/// letter, otherwise None.
-pub fn lower_ascii_letter(c: char) -> Option<char> {
-    match c.to_ascii_opt() {
-        Some(a) => if a.is_alphabetic() { Some(a.to_lowercase().to_char()) } else { None },
-        _ => None,
-    }
-}
-
-/// Map ASCII uppercase to lowercase; preserve other characters.
-pub fn lower_ascii(c: char) -> char {
-    lower_ascii_letter(c).unwrap_or(c)
-}
-
-/// Is the character an ASCII alphanumeric character?
-pub fn is_ascii_alnum(c: char) -> bool {
-    c.to_ascii_opt().map_or(false, |a| a.is_alphanumeric())
-}
-
-/// Allocate an empty string with a small non-zero capacity.
-pub fn empty_str() -> String {
-    String::with_capacity(4)
-}
-
-/// ASCII whitespace characters, as defined by
-/// tree construction modes that treat them specially.
-pub fn is_ascii_whitespace(c: char) -> bool {
-    match c {
-        '\t' | '\r' | '\n' | '\x0C' | ' ' => true,
-        _ => false,
-    }
-}
-
-/// Count how many bytes at the beginning of the string
-/// either all match or all don't match the predicate,
-/// and also return whether they match.
-///
-/// Returns `None` on an empty string.
-pub fn char_run<Pred>(mut pred: Pred, buf: &str) -> Option<(usize, bool)>
-    where Pred: FnMut(char) -> bool,
-{
-    let (first, rest) = unwrap_or_return!(buf.slice_shift_char(), None);
-    let matches = pred(first);
-
-    for (idx, ch) in rest.char_indices() {
-        if matches != pred(ch) {
-            return Some((idx + first.len_utf8(), matches));
-        }
-    }
-    Some((buf.len(), matches))
-}
-
-#[cfg(test)]
-#[allow(non_snake_case)]
-mod test {
-    use super::{char_run, is_ascii_whitespace, is_ascii_alnum, lower_ascii, lower_ascii_letter};
-
-    test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a'));
-    test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a'));
-    test_eq!(lower_letter_symbol_is_None, lower_ascii_letter('!'), None);
-    test_eq!(lower_letter_nonascii_is_None, lower_ascii_letter('\u{a66e}'), None);
-
-    test_eq!(lower_a_is_a, lower_ascii('a'), 'a');
-    test_eq!(lower_A_is_a, lower_ascii('A'), 'a');
-    test_eq!(lower_symbol_unchanged, lower_ascii('!'), '!');
-    test_eq!(lower_nonascii_unchanged, lower_ascii('\u{a66e}'), '\u{a66e}');
-
-    test_eq!(is_alnum_a, is_ascii_alnum('a'), true);
-    test_eq!(is_alnum_A, is_ascii_alnum('A'), true);
-    test_eq!(is_alnum_1, is_ascii_alnum('1'), true);
-    test_eq!(is_not_alnum_symbol, is_ascii_alnum('!'), false);
-    test_eq!(is_not_alnum_nonascii, is_ascii_alnum('\u{a66e}'), false);
-
-    macro_rules! test_char_run ( ($name:ident, $input:expr, $expect:expr) => (
-        test_eq!($name, char_run(is_ascii_whitespace, $input), $expect);
-    ));
-
-    test_char_run!(run_empty, "", None);
-    test_char_run!(run_one_t, " ", Some((1, true)));
-    test_char_run!(run_one_f, "x", Some((1, false)));
-    test_char_run!(run_t, "  \t  \n", Some((6, true)));
-    test_char_run!(run_f, "xyzzy", Some((5, false)));
-    test_char_run!(run_tf, "   xyzzy", Some((3, true)));
-    test_char_run!(run_ft, "xyzzy   ", Some((5, false)));
-    test_char_run!(run_tft, "   xyzzy   ", Some((3, true)));
-    test_char_run!(run_ftf, "xyzzy   hi", Some((5, false)));
-    test_char_run!(run_multibyte_0, "中 ", Some((3, false)));
-    test_char_run!(run_multibyte_1, " 中 ", Some((1, true)));
-    test_char_run!(run_multibyte_2, "  中 ", Some((2, true)));
-    test_char_run!(run_multibyte_3, "   中 ", Some((3, true)));
-}
diff --git a/html5ever-2015-05-15/test_util/Cargo.toml b/html5ever-2015-05-15/test_util/Cargo.toml
deleted file mode 100644
index 9b682e86f..000000000
--- a/html5ever-2015-05-15/test_util/Cargo.toml
+++ /dev/null
@@ -1,5 +0,0 @@
-[package]
-
-name = "test_util"
-version = "0.0.0"
-authors = [ "The html5ever Project Developers" ]
diff --git a/html5ever-2016-08-25/.gitignore b/html5ever-2016-08-25/.gitignore
new file mode 100644
index 000000000..d7bea00ac
--- /dev/null
+++ b/html5ever-2016-08-25/.gitignore
@@ -0,0 +1,4 @@
+/data/bench/uncommitted
+target
+Cargo.lock
+*.racertmp
diff --git a/html5ever-2016-08-25/.gitmodules b/html5ever-2016-08-25/.gitmodules
new file mode 100644
index 000000000..890f7f98d
--- /dev/null
+++ b/html5ever-2016-08-25/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "html5lib-tests"]
+	path = html5lib-tests
+	url = https://github.com/html5lib/html5lib-tests
diff --git a/html5ever-2016-08-25/.travis.yml b/html5ever-2016-08-25/.travis.yml
new file mode 100644
index 000000000..af90c545b
--- /dev/null
+++ b/html5ever-2016-08-25/.travis.yml
@@ -0,0 +1,13 @@
+branches:
+  except:
+    - servo
+
+language: rust
+rust:
+  - nightly
+  - beta
+  - stable
+script: scripts/travis-build.sh
+
+notifications:
+  webhooks: http://build.servo.org:54856/travis
diff --git a/html5ever-2016-08-25/AUTHORS b/html5ever-2016-08-25/AUTHORS
new file mode 100644
index 000000000..fa304ee07
--- /dev/null
+++ b/html5ever-2016-08-25/AUTHORS
@@ -0,0 +1,20 @@
+This software was written by the following people:
+
+Adam Roben
+Akos Kiss
+Wojciech "Zarazek" Wiśniewski
+Chris Paris
+Clark Gaebel
+Daniel Fath
+Huon Wilson
+glennw
+Josh Matthews
+György Andrasek
+Keegan McAllister
+Eunchong Yu
+Manish Goregaokar
+Chris Morgan
+Mátyás Mustoha
+Patrick Walton
+Renato Zannon
+Simon Sapin
diff --git a/html5ever-2015-05-15/COPYRIGHT b/html5ever-2016-08-25/COPYRIGHT
similarity index 100%
rename from html5ever-2015-05-15/COPYRIGHT
rename to html5ever-2016-08-25/COPYRIGHT
diff --git a/html5ever-2016-08-25/Cargo.toml b/html5ever-2016-08-25/Cargo.toml
new file mode 100644
index 000000000..b353ff971
--- /dev/null
+++ b/html5ever-2016-08-25/Cargo.toml
@@ -0,0 +1,60 @@
+[package]
+
+name = "html5ever"
+version = "0.5.4"
+authors = [ "The html5ever Project Developers" ]
+license = "MIT / Apache-2.0"
+repository = "https://github.com/servo/html5ever"
+description = "High-performance browser-grade HTML5 parser"
+documentation = "https://kmcallister.github.io/docs/html5ever/html5ever/index.html"
+build = "build.rs"
+
+[lib]
+name = "html5ever"
+
+# https://github.com/rust-lang/cargo/issues/1512
+doctest = false
+
+[[test]]
+name = "tree_builder"
+harness = false
+
+[[test]]
+name = "tokenizer"
+harness = false
+
+[[test]]
+name = "serializer"
+
+[[bench]]
+name = "tokenizer"
+harness = false
+
+[features]
+unstable = ["tendril/unstable", "string_cache/unstable"]
+heap_size = ["heapsize", "heapsize_plugin"]
+codegen = ["html5ever_macros"]
+
+[dependencies]
+log = "0"
+phf = "0.7"
+string_cache = "0.2.0"
+mac = "0"
+tendril = "0.2.2"
+heapsize = { version = ">=0.1.1, <0.4", optional = true }
+heapsize_plugin = { version = "0.1.0", optional = true }
+
+[dev-dependencies]
+rustc-serialize = "0.3.15"
+rustc-test = "0.1.3"
+
+[build-dependencies]
+phf_codegen = "0.7.3"
+rustc-serialize = "0.3.15"
+html5ever_macros = { version = "0.2.6", path = "macros", optional = true }
+
+[profile.dev]
+debug = false
+
+[profile.test]
+debug = false
diff --git a/html5ever-2015-05-15/LICENSE-APACHE b/html5ever-2016-08-25/LICENSE-APACHE
similarity index 100%
rename from html5ever-2015-05-15/LICENSE-APACHE
rename to html5ever-2016-08-25/LICENSE-APACHE
diff --git a/html5ever-2015-05-15/LICENSE-MIT b/html5ever-2016-08-25/LICENSE-MIT
similarity index 100%
rename from html5ever-2015-05-15/LICENSE-MIT
rename to html5ever-2016-08-25/LICENSE-MIT
diff --git a/html5ever-2015-05-15/README.md b/html5ever-2016-08-25/README.md
similarity index 84%
rename from html5ever-2015-05-15/README.md
rename to html5ever-2016-08-25/README.md
index 8a4c9e6bf..60c4d9384 100644
--- a/html5ever-2015-05-15/README.md
+++ b/html5ever-2016-08-25/README.md
@@ -18,16 +18,14 @@ html5ever is written in [Rust](http://www.rust-lang.org/), so it avoids the most
 Add html5ever as a dependency in your [`Cargo.toml`](http://crates.io/) file:
 
 ```toml
-[dependencies.html5ever]
-git = "https://github.com/servo/html5ever"
+[dependencies]
+html5ever = "*"
 ```
 
 Then take a look at [`examples/print-rcdom.rs`](https://github.com/servo/html5ever/blob/master/examples/print-rcdom.rs) and the [API documentation][].
 
 ## Getting started in other languages
 
-The C API is not yet complete, but it's already possible to do [tokenization](http://mainisusuallyafunction.blogspot.com/2014/08/calling-rust-library-from-c-or-anything.html).
-
 Bindings for Python and other languages are much desired.
 
@@ -50,6 +48,6 @@ html5ever exclusively uses UTF-8 to represent strings. In the future it will su
 The code is cross-referenced with the WHATWG syntax spec, and eventually we will have a way to present code and spec side-by-side.
 
-html5ever tracks Rust nightly, for now. Support for the stable Rust 1.x compilers is planned, however.
+html5ever builds against the official stable releases of Rust, though some optimizations are only supported on nightly releases.
 
-[API documentation]: https://kmcallister.github.io/docs/html5ever/html5ever/index.html
+[API documentation]: http://doc.servo.org/html5ever/index.html
diff --git a/html5ever-2015-05-15/STRUCTURE.md b/html5ever-2016-08-25/STRUCTURE.md
similarity index 86%
rename from html5ever-2015-05-15/STRUCTURE.md
rename to html5ever-2016-08-25/STRUCTURE.md
index fd30013af..00a7f33c0 100644
--- a/html5ever-2015-05-15/STRUCTURE.md
+++ b/html5ever-2016-08-25/STRUCTURE.md
@@ -12,16 +12,14 @@ The module structure is also documented in the output produced by `cargo doc`, a
 `src/serialize/`: Turning trees back into strings. Corresponds to [section 12.3 "Serialising HTML fragments"](https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments)
 
-`src/sink/`: Types that html5ever can use to represent the DOM, if you do not provide your own DOM implementation.
+`dom_sink/`: Types that html5ever can use to represent the DOM, if you do not provide your own DOM implementation.
 
 `macros/`: Rust syntax extensions used within html5ever. Users of the library do not need this crate.
 
-`capi/`: Implementation of the C API for html5ever (as yet incomplete)
-
 `tests/`: Integration tests. This is a single executable crate that runs html5ever on the various [html5lib-tests](https://github.com/html5lib/html5lib-tests). There are also unit tests throughout the library code. See `README.md` for information on running tests.
 
 `bench/`: Benchmarks. Another executable crate.
 
-`examples/`: Examples of using the library. Each `.rs` file is an executable crate.
+`examples/` and `dom_sink/examples`: Examples of using the library. Each `.rs` file is an executable crate.
 
 `data/`: Various data used in building and benchmarking the parser.
diff --git a/html5ever-2016-08-25/benches/tokenizer.rs b/html5ever-2016-08-25/benches/tokenizer.rs
new file mode 100644
index 000000000..824f5d0bb
--- /dev/null
+++ b/html5ever-2016-08-25/benches/tokenizer.rs
@@ -0,0 +1,158 @@
+// Copyright 2014 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate test;
+extern crate tendril;
+extern crate html5ever;
+
+use std::{fs, env, cmp};
+use std::path::PathBuf;
+use std::io::Read;
+use std::default::Default;
+
+use test::{black_box, Bencher, TestDesc, TestDescAndFn};
+use test::{DynTestName, DynBenchFn, TDynBenchFn};
+use test::ShouldPanic::No;
+
+use tendril::{ByteTendril, StrTendril, ReadExt, SliceExt};
+use html5ever::tokenizer::{TokenSink, Token, Tokenizer, TokenizerOpts};
+
+struct Sink;
+
+impl TokenSink for Sink {
+    fn process_token(&mut self, token: Token) {
+        // Don't use the token, but make sure we don't get
+        // optimized out entirely.
+        black_box(token);
+    }
+}
+
+// This could almost be the TokenSink too, but it's not
+// mut within run().
+struct Bench {
+    input: Vec<StrTendril>,
+    clone_only: bool,
+    opts: TokenizerOpts,
+}
+
+/// All tendrils in Bench.input are owned.
+unsafe impl Send for Bench {}
+
+impl Bench {
+    fn new(name: &str, size: Option<usize>, clone_only: bool,
+           opts: TokenizerOpts) -> Bench {
+        let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+        path.push("data/bench/");
+        path.push(name);
+        let mut file = fs::File::open(&path).ok().expect("can't open file");
+
+        // Read the file and treat it as an infinitely repeating sequence of characters.
+        let mut file_input = ByteTendril::new();
+        file.read_to_tendril(&mut file_input).ok().expect("can't read file");
+        let file_input: StrTendril = file_input.try_reinterpret().unwrap();
+        let size = size.unwrap_or(file_input.len());
+        let mut stream = file_input.chars().cycle();
+
+        // Break the input into chunks of 1024 chars (= a few kB).
+        // This simulates reading from the network.
+        let mut input = vec![];
+        let mut total = 0usize;
+        while total < size {
+            // The by_ref() call is important, otherwise we get wrong results!
+            // See rust-lang/rust#18045.
+            let sz = cmp::min(1024, size - total);
+            input.push(stream.by_ref().take(sz).collect::<String>().to_tendril());
+            total += sz;
+        }
+
+        Bench {
+            input: input,
+            clone_only: clone_only,
+            opts: opts,
+        }
+    }
+}
+
+impl TDynBenchFn for Bench {
+    fn run(&self, bh: &mut Bencher) {
+        bh.iter(|| {
+            let input = self.input.clone();
+            if self.clone_only {
+                // Because the tokenizer consumes its buffers, we need
+                // to clone inside iter(). We can benchmark this
+                // separately and subtract it out.
+                //
+                // See rust-lang/rust#18043.
+                black_box(input);
+            } else {
+                let mut tok = Tokenizer::new(Sink, self.opts.clone());
+                for buf in input.into_iter() {
+                    tok.feed(buf);
+                }
+                tok.end();
+            }
+        });
+    }
+}
+
+fn make_bench(name: &str, size: Option<usize>, clone_only: bool,
+              opts: TokenizerOpts) -> TestDescAndFn {
+    TestDescAndFn {
+        desc: TestDesc {
+            name: DynTestName([
+                "tokenize ".to_string(),
+                name.to_string(),
+                size.map_or("".to_string(), |s| format!(" size {:7}", s)),
+                (if clone_only { " (clone only)" } else { "" }).to_string(),
+                (if opts.exact_errors { " (exact errors)" } else { "" }).to_string(),
+            ].concat().to_string()),
+            ignore: false,
+            should_panic: No,
+        },
+        testfn: DynBenchFn(Box::new(Bench::new(name, size, clone_only, opts))),
+    }
+}
+
+fn tests() -> Vec<TestDescAndFn> {
+    let mut tests = vec!(make_bench("lipsum.html", Some(1024*1024), true, Default::default()));
+
+    let mut opts_vec = vec!(Default::default());
+    if env::var("BENCH_EXACT_ERRORS").is_ok() {
+        opts_vec.push(TokenizerOpts {
+            exact_errors: true,
+            .. Default::default()
+        });
+    }
+
+    for opts in opts_vec.iter() {
+        for &file in ["lipsum.html", "lipsum-zh.html", "strong.html"].iter() {
+            for &sz in [1024, 1024*1024].iter() {
+                tests.push(make_bench(file, Some(sz), false, opts.clone()));
+            }
+        }
+
+        for &file in ["tiny-fragment.html", "small-fragment.html", "medium-fragment.html"].iter() {
+            tests.push(make_bench(file, None, false, opts.clone()));
+        }
+
+        if env::var("BENCH_UNCOMMITTED").is_ok() {
+            // Not checked into the repo, so don't include by default.
+            for &file in ["sina.com.cn.html", "wikipedia.html"].iter() {
+                let name = format!("uncommitted/{}", file);
+                tests.push(make_bench(&name, None, false, opts.clone()));
+            }
+        }
+    }
+    tests
+}
+
+fn main() {
+    let args: Vec<_> = env::args().collect();
+    test::test_main(&args, tests());
+}
diff --git a/html5ever-2016-08-25/build.rs b/html5ever-2016-08-25/build.rs
new file mode 100644
index 000000000..b01463891
--- /dev/null
+++ b/html5ever-2016-08-25/build.rs
@@ -0,0 +1,131 @@
+// Copyright 2014-2015 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate phf_codegen;
+extern crate rustc_serialize;
+
+use rustc_serialize::json::{Json, Decoder};
+use rustc_serialize::Decodable;
+use std::collections::HashMap;
+use std::env;
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
+
+fn main() {
+    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+
+    let rules_rs = Path::new(&manifest_dir).join("src/tree_builder/rules.rs");
+    expand_match_tokens(
+        &rules_rs,
+        // Keep the expanded file in the source directory, so that `cargo publish` ships it.
+        &rules_rs.with_extension("expanded.rs"));
+
+    named_entities_to_phf(
+        &Path::new(&manifest_dir).join("data/entities.json"),
+        &Path::new(&env::var("OUT_DIR").unwrap()).join("named_entities.rs"));
+
+    println!("cargo:rerun-if-changed={}", rules_rs.display());
+}
+
+#[cfg(feature = "codegen")]
+fn expand_match_tokens(from: &Path, to: &Path) {
+    extern crate html5ever_macros;
+
+    html5ever_macros::pre_expand(from, to);
+}
+
+#[cfg(not(feature = "codegen"))]
+fn expand_match_tokens(from: &Path, to: &Path) {
+    use std::io::stderr;
+    use std::process::exit;
+
+    if let Err(error) = check_hash(from, to) {
+        writeln!(
+            stderr(),
+            r"
+{} is missing or not up to date with {}:
+{}
+
+Run `cargo build --features codegen` to update it.
+
+If you’re using html5ever as a dependency, this is a bad release.
+Please file an issue at https://github.com/servo/html5ever/issues/new
+with the output of `cargo pkgid html5ever`.
+",
+            to.file_name().unwrap().to_string_lossy(),
+            from.file_name().unwrap().to_string_lossy(),
+            error
+        ).unwrap();
+        exit(1);
+    }
+}
+
+#[cfg(not(feature = "codegen"))]
+fn check_hash(from: &Path, to: &Path) -> Result<(), String> {
+    use std::hash::{Hash, Hasher, SipHasher};
+    use std::io::Read;
+
+    // Unwrap here as the source file is expected to exist.
+    let mut file_from = File::open(from).unwrap();
+    let mut source = String::new();
+    let mut hasher = SipHasher::new();
+    file_from.read_to_string(&mut source).unwrap();
+    source.hash(&mut hasher);
+    let source_hash = hasher.finish();
+
+    // IO errors from here indicate we need to regenerate the expanded file.
+    let mut file_to = try!(File::open(to).map_err(|e| e.to_string()));
+    let mut expanded = String::new();
+    try!(file_to.read_to_string(&mut expanded).map_err(|e| e.to_string()));
+    let prefix = "// source SipHash: ";
+    let line = try!(expanded.lines().find(|line| line.starts_with(prefix))
+                    .ok_or("source hash not found".to_string()));
+    let expected_hash = try!(line[prefix.len()..].parse::<u64>().map_err(|e| e.to_string()));
+    if source_hash == expected_hash {
+        Ok(())
+    } else {
+        Err("different hash".to_string())
+    }
+}
+
+fn named_entities_to_phf(from: &Path, to: &Path) {
+    // A struct matching the entries in entities.json.
+    #[derive(RustcDecodable)]
+    struct CharRef {
+        codepoints: Vec<u32>,
+        //characters: String, // Present in the file but we don't need it
+    }
+
+    let json = Json::from_reader(&mut File::open(from).unwrap()).unwrap();
+    let entities: HashMap<String, CharRef> = Decodable::decode(&mut Decoder::new(json)).unwrap();
+    let mut entities: HashMap<&str, (u32, u32)> = entities.iter().map(|(name, char_ref)| {
+        assert!(name.starts_with("&"));
+        assert!(char_ref.codepoints.len() <= 2);
+        (&name[1..], (char_ref.codepoints[0], *char_ref.codepoints.get(1).unwrap_or(&0)))
+    }).collect();
+
+    // Add every missing prefix of those keys, mapping to NULL characters.
+    for key in entities.keys().cloned().collect::<Vec<_>>() {
+        for n in 1 .. key.len() {
+            entities.entry(&key[..n]).or_insert((0, 0));
+        }
+    }
+    entities.insert("", (0, 0));
+
+    let mut phf_map = phf_codegen::Map::new();
+    for (key, value) in entities {
+        phf_map.entry(key, &format!("{:?}", value));
+    }
+
+    let mut file = File::create(to).unwrap();
+    write!(&mut file, "pub static NAMED_ENTITIES: Map<&'static str, (u32, u32)> = ").unwrap();
+    phf_map.build(&mut file).unwrap();
+    write!(&mut file, ";\n").unwrap();
+}
diff --git a/html5ever-2015-05-15/data/bench/lipsum-zh.html b/html5ever-2016-08-25/data/bench/lipsum-zh.html
similarity index 100%
rename from html5ever-2015-05-15/data/bench/lipsum-zh.html
rename to html5ever-2016-08-25/data/bench/lipsum-zh.html
diff --git a/html5ever-2015-05-15/data/bench/lipsum.html b/html5ever-2016-08-25/data/bench/lipsum.html
similarity index 100%
rename from html5ever-2015-05-15/data/bench/lipsum.html
rename to html5ever-2016-08-25/data/bench/lipsum.html
diff --git a/html5ever-2015-05-15/data/bench/medium-fragment.html b/html5ever-2016-08-25/data/bench/medium-fragment.html
similarity index 100%
rename from html5ever-2015-05-15/data/bench/medium-fragment.html
rename to html5ever-2016-08-25/data/bench/medium-fragment.html
diff --git a/html5ever-2015-05-15/data/bench/small-fragment.html b/html5ever-2016-08-25/data/bench/small-fragment.html
similarity index 100%
rename from html5ever-2015-05-15/data/bench/small-fragment.html
rename to html5ever-2016-08-25/data/bench/small-fragment.html
diff --git a/html5ever-2015-05-15/data/bench/strong.html b/html5ever-2016-08-25/data/bench/strong.html
similarity index 100%
rename from html5ever-2015-05-15/data/bench/strong.html
rename to html5ever-2016-08-25/data/bench/strong.html
diff --git a/html5ever-2015-05-15/data/bench/tiny-fragment.html b/html5ever-2016-08-25/data/bench/tiny-fragment.html
similarity index 100%
rename from html5ever-2015-05-15/data/bench/tiny-fragment.html
rename to html5ever-2016-08-25/data/bench/tiny-fragment.html
diff --git a/html5ever-2015-05-15/data/entities.json b/html5ever-2016-08-25/data/entities.json
similarity index 100%
rename from html5ever-2015-05-15/data/entities.json
rename to html5ever-2016-08-25/data/entities.json
diff --git a/html5ever-2016-08-25/data/test/ignore b/html5ever-2016-08-25/data/test/ignore
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/html5ever-2016-08-25/data/test/ignore
@@ -0,0 +1 @@
+
diff --git a/html5ever-2016-08-25/examples/capi/tokenize.c b/html5ever-2016-08-25/examples/capi/tokenize.c
new file mode 100644
index 000000000..3d0776346
--- /dev/null
+++ b/html5ever-2016-08-25/examples/capi/tokenize.c
@@ -0,0 +1,74 @@
+// Copyright 2014 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
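+//
+// This example feeds the HTML fragment given as argv[1] to the tokenizer
+// and prints each token through the callbacks registered in `ops` below.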
+
+#include <stdio.h>
+
+#include "html5ever.h"
+
+void put_str(const char *x) {
+    fputs(x, stdout);
+}
+
+void put_buf(struct h5e_buf text) {
+    fwrite(text.data, text.len, 1, stdout);
+}
+
+void do_chars(void *user, struct h5e_buf text) {
+    put_str("CHARS : ");
+    put_buf(text);
+    put_str("\n");
+}
+
+void do_start_tag(void *user, struct h5e_buf name, int self_closing, size_t num_attrs) {
+    put_str("TAG : <");
+    put_buf(name);
+    if (self_closing) {
+        putchar('/');
+    }
+    put_str(">\n");
+}
+
+void do_tag_attr(void *user, struct h5e_buf name, struct h5e_buf value) {
+    put_str(" ATTR: ");
+    put_buf(name);
+    put_str("=\"");
+    put_buf(value);
+    put_str("\"\n");
+}
+
+void do_end_tag(void *user, struct h5e_buf name) {
+    put_str("TAG : </");
+    put_buf(name);
+    put_str(">\n");
+}
+
+struct h5e_token_ops ops = {
+    .do_chars = do_chars,
+    .do_start_tag = do_start_tag,
+    .do_tag_attr = do_tag_attr,
+    .do_end_tag = do_end_tag,
+};
+
+struct h5e_token_sink sink = {
+    .ops = &ops,
+    .user = NULL,
+};
+
+int main(int argc, char *argv[]) {
+    if (argc < 2) {
+        printf("Usage: %s 'HTML fragment'\n", argv[0]);
+        return 1;
+    }
+
+    struct h5e_tokenizer *tok = h5e_tokenizer_new(&sink);
+    h5e_tokenizer_feed(tok, h5e_buf_from_cstr(argv[1]));
+    h5e_tokenizer_end(tok);
+    h5e_tokenizer_free(tok);
+    return 0;
+}
diff --git a/html5ever-2016-08-25/examples/html2html.rs b/html5ever-2016-08-25/examples/html2html.rs
new file mode 100644
index 000000000..d5c73e0a1
--- /dev/null
+++ b/html5ever-2016-08-25/examples/html2html.rs
@@ -0,0 +1,50 @@
+// Copyright 2014 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Parse and re-serialize an HTML5 document.
+//!
+//! This is meant to produce the exact same output (ignoring stderr) as
+//!
+//!   java -classpath htmlparser-1.4.jar nu.validator.htmlparser.tools.HTML2HTML
+//!
+//! where htmlparser-1.4.jar comes from http://about.validator.nu/htmlparser/
+
+extern crate tendril;
+extern crate html5ever;
+
+use std::io::{self, Write};
+use std::default::Default;
+
+use tendril::TendrilSink;
+
+use html5ever::driver::ParseOpts;
+use html5ever::tree_builder::TreeBuilderOpts;
+use html5ever::{parse_document, serialize};
+use html5ever::rcdom::RcDom;
+
+fn main() {
+    let opts = ParseOpts {
+        tree_builder: TreeBuilderOpts {
+            drop_doctype: true,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let stdin = io::stdin();
+    let dom = parse_document(RcDom::default(), opts)
+        .from_utf8()
+        .read_from(&mut stdin.lock())
+        .unwrap();
+
+    // The validator.nu HTML2HTML always prints a doctype at the very beginning.
+    io::stdout().write_all(b"<!DOCTYPE html>\n")
+        .ok().expect("writing DOCTYPE failed");
+    serialize(&mut io::stdout(), &dom.document, Default::default())
+        .ok().expect("serialization failed");
+}
diff --git a/html5ever-2016-08-25/examples/noop-tokenize.rs b/html5ever-2016-08-25/examples/noop-tokenize.rs
new file mode 100644
index 000000000..1310d54c7
--- /dev/null
+++ b/html5ever-2016-08-25/examples/noop-tokenize.rs
@@ -0,0 +1,40 @@
+// Copyright 2014 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// Run a single benchmark once. For use with profiling tools.
+
+extern crate html5ever;
+extern crate tendril;
+
+use std::io;
+use std::default::Default;
+
+use tendril::{ByteTendril, ReadExt};
+
+use html5ever::tokenizer::{TokenSink, Token, Tokenizer};
+
+struct Sink(Vec<Token>);
+
+impl TokenSink for Sink {
+    fn process_token(&mut self, token: Token) {
+        // Don't use the token, but make sure we don't get
+        // optimized out entirely.
+        self.0.push(token);
+    }
+}
+
+fn main() {
+    let mut input = ByteTendril::new();
+    io::stdin().read_to_tendril(&mut input).unwrap();
+    let input = input.try_reinterpret().unwrap();
+
+    let mut tok = Tokenizer::new(Sink(Vec::new()), Default::default());
+    tok.feed(input);
+    tok.end();
+}
diff --git a/html5ever-2016-08-25/examples/noop-tree-builder.rs b/html5ever-2016-08-25/examples/noop-tree-builder.rs
new file mode 100644
index 000000000..006ed1a0f
--- /dev/null
+++ b/html5ever-2016-08-25/examples/noop-tree-builder.rs
@@ -0,0 +1,106 @@
+// Copyright 2014 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#[macro_use]
+extern crate string_cache;
+extern crate tendril;
+extern crate html5ever;
+
+use std::io;
+use std::default::Default;
+use std::collections::HashMap;
+use std::borrow::Cow;
+use string_cache::QualName;
+
+use tendril::{StrTendril, TendrilSink};
+
+use html5ever::parse_document;
+use html5ever::tokenizer::Attribute;
+use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText};
+
+struct Sink {
+    next_id: usize,
+    names: HashMap<usize, QualName>,
+}
+
+impl Sink {
+    fn get_id(&mut self) -> usize {
+        let id = self.next_id;
+        self.next_id += 2;
+        id
+    }
+}
+
+impl TreeSink for Sink {
+    type Handle = usize;
+    type Output = Self;
+    fn finish(self) -> Self { self }
+
+    fn get_document(&mut self) -> usize {
+        0
+    }
+
+    fn get_template_contents(&mut self, target: usize) -> usize {
+        if let Some(&qualname!(html, "template")) = self.names.get(&target) {
+            target + 1
+        } else {
+            panic!("not a template element")
+        }
+    }
+
+    fn same_node(&self, x: usize, y: usize) -> bool {
+        x == y
+    }
+
+    fn elem_name(&self, target: usize) -> QualName {
+        self.names.get(&target).expect("not an element").clone()
+    }
+
+    fn create_element(&mut self, name: QualName, _attrs: Vec<Attribute>) -> usize {
+        let id = self.get_id();
+        self.names.insert(id, name);
+        id
+    }
+
+    fn create_comment(&mut self, _text: StrTendril) -> usize {
+        self.get_id()
+    }
+
+    fn append_before_sibling(&mut self,
+            _sibling: usize,
+            _new_node: NodeOrText<usize>) -> Result<(), NodeOrText<usize>> {
+        // `sibling` will have a parent unless a script moved it, and we're
+        // not running scripts. Therefore we can always return `Ok(())`.
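+        // (If the sibling could be parentless, the TreeSink contract is to
+        // hand `_new_node` back in an `Err` so the tree builder can fall
+        // back to a plain append.)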
+        Ok(())
+    }
+
+    fn parse_error(&mut self, _msg: Cow<'static, str>) { }
+    fn set_quirks_mode(&mut self, _mode: QuirksMode) { }
+    fn append(&mut self, _parent: usize, _child: NodeOrText<usize>) { }
+
+    fn append_doctype_to_document(&mut self, _: StrTendril, _: StrTendril, _: StrTendril) { }
+    fn add_attrs_if_missing(&mut self, target: usize, _attrs: Vec<Attribute>) {
+        assert!(self.names.contains_key(&target), "not an element");
+    }
+    fn remove_from_parent(&mut self, _target: usize) { }
+    fn reparent_children(&mut self, _node: usize, _new_parent: usize) { }
+    fn mark_script_already_started(&mut self, _node: usize) { }
+}
+
+fn main() {
+    let sink = Sink {
+        next_id: 1,
+        names: HashMap::new(),
+    };
+    let stdin = io::stdin();
+    parse_document(sink, Default::default())
+        .from_utf8()
+        .read_from(&mut stdin.lock())
+        .unwrap();
+}
diff --git a/html5ever-2016-08-25/examples/print-rcdom.rs b/html5ever-2016-08-25/examples/print-rcdom.rs
new file mode 100644
index 000000000..016a9595d
--- /dev/null
+++ b/html5ever-2016-08-25/examples/print-rcdom.rs
@@ -0,0 +1,79 @@
+// Copyright 2014 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate html5ever;
+
+#[macro_use]
+extern crate string_cache;
+extern crate tendril;
+
+use std::io::{self, Read};
+use std::iter::repeat;
+use std::default::Default;
+use std::string::String;
+
+use tendril::TendrilSink;
+use html5ever::parse_document;
+use html5ever::rcdom::{Document, Doctype, Text, Comment, Element, RcDom, Handle};
+
+// This is not proper HTML serialization, of course.
+
+fn walk(indent: usize, handle: Handle) {
+    let node = handle.borrow();
+    // FIXME: don't allocate
+    print!("{}", repeat(" ").take(indent).collect::<String>());
+    match node.node {
+        Document
+            => println!("#Document"),
+
+        Doctype(ref name, ref public, ref system)
+            => println!("<!DOCTYPE {} \"{}\" \"{}\">", *name, *public, *system),
+
+        Text(ref text)
+            => println!("#text: {}", escape_default(text)),
+
+        Comment(ref text)
+            => println!("<!-- {} -->", escape_default(text)),
+
+        Element(ref name, _, ref attrs) => {
+            assert!(name.ns == ns!(html));
+            print!("<{}", name.local);
+            for attr in attrs.iter() {
+                assert!(attr.name.ns == ns!());
+                print!(" {}=\"{}\"", attr.name.local, attr.value);
+            }
+            println!(">");
+        }
+    }
+
+    for child in node.children.iter() {
+        walk(indent+4, child.clone());
+    }
+}
+
+// FIXME: Copy of str::escape_default from std, which is currently unstable
+pub fn escape_default(s: &str) -> String {
+    s.chars().flat_map(|c| c.escape_default()).collect()
+}
+
+fn main() {
+    let stdin = io::stdin();
+    let dom = parse_document(RcDom::default(), Default::default())
+        .from_utf8()
+        .read_from(&mut stdin.lock())
+        .unwrap();
+    walk(0, dom.document);
+
+    if !dom.errors.is_empty() {
+        println!("\nParse errors:");
+        for err in dom.errors.into_iter() {
+            println!("    {}", err);
+        }
+    }
+}
diff --git a/html5ever-2016-08-25/examples/print-tree-actions.rs b/html5ever-2016-08-25/examples/print-tree-actions.rs
new file mode 100644
index 000000000..19c37159e
--- /dev/null
+++ b/html5ever-2016-08-25/examples/print-tree-actions.rs
@@ -0,0 +1,153 @@
+// Copyright 2014 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#[macro_use]
+extern crate string_cache;
+extern crate tendril;
+extern crate html5ever;
+
+use std::io;
+use std::default::Default;
+use std::collections::HashMap;
+use std::borrow::Cow;
+use string_cache::QualName;
+
+use tendril::{StrTendril, TendrilSink};
+
+use html5ever::tokenizer::Attribute;
+use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
+use html5ever::parse_document;
+
+struct Sink {
+    next_id: usize,
+    names: HashMap<usize, QualName>,
+}
+
+impl Sink {
+    fn get_id(&mut self) -> usize {
+        let id = self.next_id;
+        self.next_id += 2;
+        id
+    }
+}
+
+impl TreeSink for Sink {
+    type Handle = usize;
+    type Output = Self;
+    fn finish(self) -> Self { self }
+
+    fn parse_error(&mut self, msg: Cow<'static, str>) {
+        println!("Parse error: {}", msg);
+    }
+
+    fn get_document(&mut self) -> usize {
+        0
+    }
+
+    fn get_template_contents(&mut self, target: usize) -> usize {
+        if let Some(&qualname!(html, "template")) = self.names.get(&target) {
+            target + 1
+        } else {
+            panic!("not a template element")
+        }
+    }
+
+    fn set_quirks_mode(&mut self, mode: QuirksMode) {
+        println!("Set quirks mode to {:?}", mode);
+    }
+
+    fn same_node(&self, x: usize, y: usize) -> bool {
+        x == y
+    }
+
+    fn elem_name(&self, target: usize) -> QualName {
+        self.names.get(&target).expect("not an element").clone()
+    }
+
+    fn create_element(&mut self, name: QualName, _attrs: Vec<Attribute>) -> usize {
+        let id = self.get_id();
+        println!("Created {:?} as {}", name, id);
+        self.names.insert(id, name);
+        id
+    }
+
+    fn create_comment(&mut self, text: StrTendril) -> usize {
+        let id = self.get_id();
+        println!("Created comment \"{}\" as {}", escape_default(&text), id);
+        id
+    }
+
+    fn append(&mut self, parent: usize, child: NodeOrText<usize>) {
+        match child {
+            AppendNode(n)
+                => println!("Append node {} to {}", n, parent),
+            AppendText(t)
+                => println!("Append text to {}: \"{}\"", parent, escape_default(&t)),
+        }
+    }
+
+    fn append_before_sibling(&mut self,
+            sibling: usize,
+            new_node: NodeOrText<usize>) -> Result<(), NodeOrText<usize>> {
+        match new_node {
+            AppendNode(n)
+                => println!("Append node {} before {}", n, sibling),
+            AppendText(t)
+                => println!("Append text before {}: \"{}\"", sibling, escape_default(&t)),
+        }
+
+        // `sibling` will have a parent unless a script moved it, and we're
+        // not running scripts. Therefore we can always return `Ok(())`.
+ Ok(()) + } + + fn append_doctype_to_document(&mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril) { + println!("Append doctype: {} {} {}", name, public_id, system_id); + } + + fn add_attrs_if_missing(&mut self, target: usize, attrs: Vec) { + assert!(self.names.contains_key(&target), "not an element"); + println!("Add missing attributes to {}:", target); + for attr in attrs.into_iter() { + println!(" {:?} = {}", attr.name, attr.value); + } + } + + fn remove_from_parent(&mut self, target: usize) { + println!("Remove {} from parent", target); + } + + fn reparent_children(&mut self, node: usize, new_parent: usize) { + println!("Move children from {} to {}", node, new_parent); + } + + fn mark_script_already_started(&mut self, node: usize) { + println!("Mark script {} as already started", node); + } +} + +// FIXME: Copy of str::escape_default from std, which is currently unstable +pub fn escape_default(s: &str) -> String { + s.chars().flat_map(|c| c.escape_default()).collect() +} + +fn main() { + let sink = Sink { + next_id: 1, + names: HashMap::new(), + }; + let stdin = io::stdin(); + parse_document(sink, Default::default()) + .from_utf8() + .read_from(&mut stdin.lock()) + .unwrap(); +} diff --git a/html5ever-2016-08-25/examples/tokenize.rs b/html5ever-2016-08-25/examples/tokenize.rs new file mode 100644 index 000000000..08dd1ae61 --- /dev/null +++ b/html5ever-2016-08-25/examples/tokenize.rs @@ -0,0 +1,94 @@ +// Copyright 2014 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate tendril; +extern crate html5ever; + +use std::io::{self, Read}; +use std::default::Default; + +use tendril::{ByteTendril, ReadExt}; + +use html5ever::tokenizer::{TokenSink, Tokenizer, Token, TokenizerOpts, ParseError}; +use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken, StartTag, EndTag}; + +#[derive(Copy, Clone)] +struct TokenPrinter { + in_char_run: bool, +} + +impl TokenPrinter { + fn is_char(&mut self, is_char: bool) { + match (self.in_char_run, is_char) { + (false, true ) => print!("CHAR : \""), + (true, false) => println!("\""), + _ => (), + } + self.in_char_run = is_char; + } + + fn do_char(&mut self, c: char) { + self.is_char(true); + print!("{}", c.escape_default().collect::()); + } +} + +impl TokenSink for TokenPrinter { + fn process_token(&mut self, token: Token) { + match token { + CharacterTokens(b) => { + for c in b.chars() { + self.do_char(c); + } + } + NullCharacterToken => self.do_char('\0'), + TagToken(tag) => { + self.is_char(false); + // This is not proper HTML serialization, of course. 
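+                // The \x1b[..m sequences below are ANSI color escapes:
+                // green for start tags, red for end tags and the self-closing
+                // slash, cyan/blue for attribute names and values.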
+ match tag.kind { + StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name), + EndTag => print!("TAG : <\x1b[31m/{}\x1b[0m", tag.name), + } + for attr in tag.attrs.iter() { + print!(" \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'", + attr.name.local, attr.value); + } + if tag.self_closing { + print!(" \x1b[31m/\x1b[0m"); + } + println!(">"); + } + ParseError(err) => { + self.is_char(false); + println!("ERROR: {}", err); + } + _ => { + self.is_char(false); + println!("OTHER: {:?}", token); + } + } + } +} + +fn main() { + let mut sink = TokenPrinter { + in_char_run: false, + }; + let mut input = ByteTendril::new(); + io::stdin().read_to_tendril(&mut input).unwrap(); + let input = input.try_reinterpret().unwrap(); + + let mut tok = Tokenizer::new(sink, TokenizerOpts { + profile: true, + .. Default::default() + }); + tok.feed(input); + tok.end(); + sink.is_char(false); +} diff --git a/html5ever-2016-08-25/macros/Cargo.toml b/html5ever-2016-08-25/macros/Cargo.toml new file mode 100644 index 000000000..da6a9e254 --- /dev/null +++ b/html5ever-2016-08-25/macros/Cargo.toml @@ -0,0 +1,15 @@ +[package] + +name = "html5ever_macros" +version = "0.2.6" +authors = [ "The html5ever Project Developers" ] +license = "MIT / Apache-2.0" +repository = "https://github.com/servo/html5ever" +description = "High-performance browser-grade HTML5 parser − compiler plugins" + +[lib] +name = "html5ever_macros" +plugin = true + +[dependencies] +mac = "0" diff --git a/html5ever-2015-05-15/macros/src/lib.rs b/html5ever-2016-08-25/macros/src/lib.rs similarity index 59% rename from html5ever-2015-05-15/macros/src/lib.rs rename to html5ever-2016-08-25/macros/src/lib.rs index 3fefe2dba..f374f28f4 100644 --- a/html5ever-2015-05-15/macros/src/lib.rs +++ b/html5ever-2016-08-25/macros/src/lib.rs @@ -7,26 +7,18 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![crate_name="html5ever_macros"] -#![crate_type="dylib"] - -#![feature(plugin_registrar, quote, rustc_private, slice_patterns)] +#![feature(quote, rustc_private)] #![deny(warnings)] extern crate syntax; -extern crate rustc; -extern crate rustc_plugin; -extern crate rustc_serialize; #[macro_use] extern crate mac; -use rustc_plugin::Registry; - // See https://github.com/rust-lang/rust/pull/23857 macro_rules! panictry { ($e:expr) => ({ - use syntax::errors::FatalError; + use syntax::diagnostic::FatalError; match $e { Ok(e) => e, Err(FatalError) => panic!(FatalError) @@ -35,12 +27,7 @@ macro_rules! panictry { } // Make these public so that rustdoc will generate documentation for them. -pub mod named_entities; pub mod match_token; +pub mod pre_expand; -// NB: This needs to be public or we get a linker error. -#[plugin_registrar] -pub fn plugin_registrar(reg: &mut Registry) { - reg.register_macro("named_entities", named_entities::expand); - reg.register_macro("match_token", match_token::expand); -} +pub use pre_expand::pre_expand; diff --git a/html5ever-2015-05-15/macros/src/match_token.rs b/html5ever-2016-08-25/macros/src/match_token.rs similarity index 87% rename from html5ever-2015-05-15/macros/src/match_token.rs rename to html5ever-2016-08-25/macros/src/match_token.rs index 7a3da4e90..0e9a3237b 100644 --- a/html5ever-2015-05-15/macros/src/match_token.rs +++ b/html5ever-2016-08-25/macros/src/match_token.rs @@ -67,8 +67,8 @@ tag @ => ... expands to something like ```rust -TagToken(tag @ Tag { name: atom!(html), kind: StartTag }) -| TagToken(tag @ Tag { name: atom!(head), kind: StartTag }) => ... 
+TagToken(tag @ Tag { name: atom!("html"), kind: StartTag }) +| TagToken(tag @ Tag { name: atom!("head"), kind: StartTag }) => ... ``` A wildcard tag matches any tag of the appropriate kind, *unless* it was @@ -100,7 +100,7 @@ matching, by enforcing the following restrictions on its input: use std::collections::{HashSet, HashMap}; use std::collections::hash_map::Entry::{Occupied, Vacant}; -use syntax::errors::FatalError; +use syntax::diagnostic::FatalError; use syntax::ptr::P; use syntax::codemap::{Span, Spanned, spanned}; use syntax::ast; @@ -115,8 +115,6 @@ use self::RHS::{Else, Expr}; type Tokens = Vec; -type TagName = ast::Ident; - // FIXME: duplicated in src/tokenizer/interface.rs #[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] enum TagKind { @@ -140,7 +138,7 @@ impl TagKind { #[derive(PartialEq, Eq, Hash, Clone)] struct Tag { kind: TagKind, - name: Option, + name: Option, } /// Left-hand side of a pattern-match arm. @@ -189,7 +187,7 @@ fn parse_tag(parser: &mut Parser) -> Result, FatalError> { }; let name = match try!(parser.eat(&token::Underscore)) { true => None, - false => Some(try!(parser.parse_ident())), + false => Some((*try!(parser.parse_ident()).name.as_str()).to_owned()), }; try!(parser.expect(&token::Gt)); @@ -203,7 +201,7 @@ fn parse_tag(parser: &mut Parser) -> Result, FatalError> { fn parse(cx: &mut ExtCtxt, toks: &[ast::TokenTree]) -> Result { let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), toks.to_vec()); - let discriminant = try!(parser.parse_expr_res(Restrictions::RESTRICTION_NO_STRUCT_LITERAL, None)); + let discriminant = try!(parser.parse_expr_res(Restrictions::RESTRICTION_NO_STRUCT_LITERAL)); try!(parser.commit_expr_expecting(&*discriminant, token::OpenDelim(token::Brace))); let mut arms: Vec = Vec::new(); @@ -216,7 +214,7 @@ fn parse(cx: &mut ExtCtxt, toks: &[ast::TokenTree]) -> Result let lhs_lo = parser.span.lo; let lhs = match parser.token { - token::Underscore | token::Ident(..) => Pat(parser.parse_pat().unwrap()), + token::Underscore | token::Ident(..) => Pat(try!(parser.parse_pat())), token::Lt => { let mut tags = Vec::new(); while parser.token != token::FatArrow { @@ -236,7 +234,7 @@ fn parse(cx: &mut ExtCtxt, toks: &[ast::TokenTree]) -> Result try!(parser.expect(&token::Comma)); Else } else { - let expr = try!(parser.parse_expr_res(Restrictions::RESTRICTION_STMT_EXPR, None)); + let expr = try!(parser.parse_expr_res(Restrictions::RESTRICTION_STMT_EXPR)); rhs_hi = parser.last_span.hi; let require_comma = @@ -292,14 +290,24 @@ fn make_tag_pattern(cx: &mut ExtCtxt, binding: Tokens, tag: Tag) -> Tokens { ) } +macro_rules! ext_err { + ($span: expr, $message: expr) => { return Err(($span, $message)) } +} +macro_rules! ext_err_if { + ($condition: expr, $span: expr, $message: expr) => { + if $condition { return Err(($span, $message)) } + } +} + /// Expand the `match_token!` macro. -pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box { +pub fn expand_to_tokens(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) + -> Result, (Span, &'static str)> { let Match { discriminant, mut arms } = panictry!(parse(cx, toks)); // Handle the last arm specially at the end. let last_arm = match arms.pop() { Some(x) => x, - None => ext_bail!(cx, span, "need at least one match arm"), + None => ext_err!(span, "need at least one match arm"), }; // Code for the arms other than the last one. 
@@ -324,11 +332,11 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box ext_bail!(cx, rhs.span, "'else' may not appear with an ordinary pattern"), + => ext_err!(rhs.span, "'else' may not appear with an ordinary pattern"), // ordinary pattern => expression (Pat(pat), Expr(expr)) => { - ext_bail_if!(!wildcards.is_empty(), cx, lhs.span, + ext_err_if!(!wildcards.is_empty(), lhs.span, "ordinary patterns may not appear after wildcard tags"); push_all(&mut arm_code, quote_tokens!(&mut *cx, $binding $pat => $expr,)); } @@ -336,8 +344,8 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box ... => else (Tags(tags), Else) => { for Spanned { span, node: tag } in tags.into_iter() { - ext_bail_if!(!seen_tags.insert(tag.clone()), cx, span, "duplicate tag"); - ext_bail_if!(tag.name.is_none(), cx, rhs.span, + ext_err_if!(!seen_tags.insert(tag.clone()), span, "duplicate tag"); + ext_err_if!(tag.name.is_none(), rhs.span, "'else' may not appear with a wildcard tag"); match wild_excluded.entry(tag.kind) { Occupied(e) => { e.into_mut().push(tag.clone()); } @@ -353,15 +361,15 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box Some(_) => { - ext_bail_if!(!wildcards.is_empty(), cx, lhs.span, + ext_err_if!(!wildcards.is_empty(), lhs.span, "specific tags may not appear after wildcard tags"); - ext_bail_if!(wildcard == Some(true), cx, span, + ext_err_if!(wildcard == Some(true), span, "wildcard tags must appear alone"); if wildcard.is_some() { @@ -375,7 +383,7 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box None => { - ext_bail_if!(wildcard.is_some(), cx, span, + ext_err_if!(wildcard.is_some(), span, "wildcard tags must appear alone"); wildcard = Some(true); wildcards.push(WildcardArm { @@ -388,7 +396,7 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box ext_bail!(cx, lhs.span, "[internal macro error] tag arm with no tags"), + None => ext_err!(lhs.span, "[internal macro error] tag arm with no tags"), Some(false) => { push_all(&mut arm_code, quote_tokens!(&mut *cx, => $expr,)); } @@ -402,8 +410,8 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box { // let enable_wildcards = match last_arm_token { - // TagToken(Tag { kind: EndTag, name: atom!(body), .. }) => false, - // TagToken(Tag { kind: EndTag, name: atom!(html), .. }) => false, + // TagToken(Tag { kind: EndTag, name: atom!("body"), .. }) => false, + // TagToken(Tag { kind: EndTag, name: atom!("html"), .. }) => false, // // ... // _ => true, // }; @@ -424,12 +432,12 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box ext_bail!(cx, id.span, "the last arm cannot have an @-binding"), - (None, Tags(_), _) => ext_bail!(cx, lhs.span, "the last arm cannot have tag patterns"), - (None, _, Else) => ext_bail!(cx, rhs.span, "the last arm cannot use 'else'"), + (Some(id), _, _) => ext_err!(id.span, "the last arm cannot have an @-binding"), + (None, Tags(_), _) => ext_err!(lhs.span, "the last arm cannot have tag patterns"), + (None, _, Else) => ext_err!(rhs.span, "the last arm cannot use 'else'"), (None, Pat(p), Expr(e)) => match p.node { ast::PatWild | ast::PatIdent(..) => (p, e), - _ => ext_bail!(cx, lhs.span, "the last arm must have a wildcard or ident pattern"), + _ => ext_err!(lhs.span, "the last arm must have a wildcard or ident pattern"), }, }; @@ -456,7 +464,7 @@ pub fn expand(cx: &mut ExtCtxt, span: Span, toks: &[ast::TokenTree]) -> Box or the MIT license +// , at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +use match_token; +use std::fs::File; +use std::hash::{Hash, Hasher, SipHasher}; +use std::io::{Read, Write}; +use std::path::Path; +use std::rc::Rc; +use syntax::{ast, codemap, ext, parse, print}; +use syntax::parse::token; + +pub fn pre_expand(from: &Path, to: &Path) { + let mut source = String::new(); + let mut file_from = File::open(from).unwrap(); + file_from.read_to_string(&mut source).unwrap(); + + let mut file_to = File::create(to).unwrap(); + write_header(&from, &source, &mut file_to); + + let sess = parse::ParseSess::new(); + let mut feature_gated_cfgs = Vec::new(); + let mut cx = ext::base::ExtCtxt::new(&sess, vec![], + ext::expand::ExpansionConfig::default("".to_owned()), + &mut feature_gated_cfgs); + + let from = from.to_string_lossy().into_owned(); + let tts = parse::parse_tts_from_source_str(from, source, vec![], &sess); + let tts = find_and_expand_match_token(&mut cx, tts); + let tts = pretty(&mut cx, tts); + + let expanded = print::pprust::tts_to_string(&tts); + file_to.write_all(expanded.as_bytes()).unwrap(); +} + +fn find_and_expand_match_token(cx: &mut ext::base::ExtCtxt, tts: Vec) + -> Vec { + let mut expanded = Vec::new(); + let mut tts = tts.into_iter().peekable(); + while let Some(tt) = tts.next() { + match tt { + ast::TokenTree::Token(span, token::Token::Ident(ident, token::IdentStyle::Plain)) + if ident.name.as_str() == "match_token" + => { + // `!` + if !matches!(tts.next(), Some(ast::TokenTree::Token(_, token::Token::Not))) { + expanded.push(tt); + continue + } + match tts.next() { + Some(ast::TokenTree::Delimited(_, block)) => { + cx.bt_push(expn_info(span)); + expanded.extend( + match match_token::expand_to_tokens(cx, span, &block.tts) { + Ok(tts) => tts, + Err((span, message)) => { + cx.parse_sess.span_diagnostic.span_err(span, message); + panic!("Error in match_token! expansion."); + } + }); + cx.bt_pop(); + } + _ => panic!("expected a block after {:?}", span) + } + } + ast::TokenTree::Delimited(span, mut block) => { + Rc::make_mut(&mut block); + let block = Rc::try_unwrap(block).unwrap(); + expanded.push(ast::TokenTree::Delimited(span, Rc::new(ast::Delimited { + delim: block.delim, + open_span: block.open_span, + tts: find_and_expand_match_token(cx, block.tts), + close_span: block.close_span, + }))) + } + _ => expanded.push(tt) + } + } + expanded +} + +fn expn_info(span: codemap::Span) -> codemap::ExpnInfo { + codemap::ExpnInfo { + call_site: span, + callee: codemap::NameAndSpan { + format: codemap::ExpnFormat::MacroBang(token::intern("match_token")), + allow_internal_unstable: false, + span: None, + } + } +} + +/// Somehow, going through a parser and back to tokens gives nicer whitespace. 
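+/// (The round trip: parse the token stream into items, then quote the items
+/// straight back into a token stream for printing.)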
+fn pretty(cx: &mut ext::base::ExtCtxt, tts: Vec) -> Vec { + let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), tts); + let start_span = parser.span; + let mut items = Vec::new(); + let attrs = parser.parse_inner_attributes().unwrap(); + while let Ok(Some(item)) = parser.parse_item() { + items.push(item) + } + cx.bt_push(expn_info(start_span)); + quote_tokens!(&mut *cx, $attrs $items) +} + +fn write_header(source_file_name: &Path, source: &str, file: &mut File) { + let mut hasher = SipHasher::new(); + source.hash(&mut hasher); + let source_hash = hasher.finish(); + + for header_line in source.lines().take_while(|line| line.starts_with("//")) { + writeln!(file, "{}", header_line).unwrap(); + } + writeln!(file, r" +// This file is generated from {} +// source SipHash: {} +", + source_file_name.file_name().unwrap().to_string_lossy(), source_hash).unwrap(); +} diff --git a/html5ever-2015-05-15/makefile b/html5ever-2016-08-25/makefile similarity index 100% rename from html5ever-2015-05-15/makefile rename to html5ever-2016-08-25/makefile diff --git a/html5ever-2016-08-25/scripts/bench-branches.py b/html5ever-2016-08-25/scripts/bench-branches.py new file mode 100755 index 000000000..c7c8c7cb4 --- /dev/null +++ b/html5ever-2016-08-25/scripts/bench-branches.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# Copyright 2014 The html5ever Project Developers. See the +# COPYRIGHT file at the top-level directory of this distribution. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +import os +import sys +import simplejson +import subprocess + +if not os.getcwd().endswith('/build'): + sys.stderr.write('Run me from the build directory') + sys.exit(1) + +branches = sys.argv[1:] + +# Prefixing a branch name with '=' means don't re-run that benchmark. +branches_run = [b for b in branches if not b.startswith('=')] +branches = [b.lstrip('=') for b in branches] + +baseline = branches[0] + +for branch in branches_run: + subprocess.check_call( + '''../configure && + git checkout {0:s} && + BENCH_UNCOMMITTED=1 make RUSTFLAGS="-O" METRICS=metrics.{0:s}.json clean bench \ + | tee bench.{0:s}''' + .format(branch), shell=True) + +data = {} +for branch in branches: + with file('metrics.{:s}.json'.format(branch)) as f: + data[branch] = simplejson.load(f) + +keys = data[data.iterkeys().next()].keys() +for branch, dat in data.iteritems(): + if branch == baseline: + continue + for k in keys: + old = data[baseline][k]['value'] + new = dat[k]['value'] + chg = (new - old) / float(old) + desc = 'worse' + if chg < 0: + desc = 'better' + chg = -chg + + print '{:50s}: {:8s} {:6s} by {:5.1f}%'.format( + k, branch, desc, 100*chg) + + print diff --git a/html5ever-2016-08-25/scripts/extract-from-spec.py b/html5ever-2016-08-25/scripts/extract-from-spec.py new file mode 100755 index 000000000..2035b359b --- /dev/null +++ b/html5ever-2016-08-25/scripts/extract-from-spec.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# Copyright 2014 The html5ever Project Developers. See the +# COPYRIGHT file at the top-level directory of this distribution. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +import re +import bs4 + +# Extract information from the WHATWG webapp spec. 
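+#
+# Assumes a local snapshot of the spec saved as webapps.html; from its
+# "Tokenization" section this can emit the tokenizer State enum and a
+# Graphviz digraph of the tokenizer state transitions.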
+ +def parse_spec(): + with file('webapps.html') as f: + soup = bs4.BeautifulSoup(f) + + return { + 'tokenization': soup.find(text='Tokenization').find_parent('div'), + } + +def tokenizer_state_ident(longname): + longname = longname.lower() + assert longname[-5:] == 'state' + words = re.sub(r'[^a-z]', ' ', longname[:-5]).split() + return ''.join(w.title() for w in words) + +def extract_tokenizer_states(spec): + with file('tokenizer/states.rs', 'w') as f: + f.write('pub enum State {\n') + + for statedefn in spec['tokenization'].select('h5 > dfn'): + f.write(' %s,\n' % (tokenizer_state_ident(statedefn.text))) + + f.write('}\n') + +def extract_tokenizer_graph(spec): + with file('build/states.dot', 'w') as f: + f.write('strict digraph {\n') + + for sec in spec['tokenization'].select('h5'): + name = sec.text + if name == 'Tokenizing character references': + continue + ident = tokenizer_state_ident(name) + + txt = '' + for sib in sec.next_siblings: + if isinstance(sib, bs4.Tag): + if sib.name == 'h5': + break + txt += sib.get_text() + else: + txt += sib + + for edge in re.finditer(r'[sS]witch to the (.* state)', txt): + f.write(' %s -> %s;\n' % (ident, tokenizer_state_ident(edge.group(1)))) + + f.write('}\n') + +spec = parse_spec() + +# extract_tokenizer_states(spec) # has manual changes +extract_tokenizer_graph(spec) diff --git a/html5ever-2016-08-25/scripts/travis-build.sh b/html5ever-2016-08-25/scripts/travis-build.sh new file mode 100755 index 000000000..960a9f826 --- /dev/null +++ b/html5ever-2016-08-25/scripts/travis-build.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright 2015 The html5ever Project Developers. See the +# COPYRIGHT file at the top-level directory of this distribution. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +set -ex + +if [ $TRAVIS_RUST_VERSION = nightly ] +then + cargo test --features "rustc-test/capture" + cargo test --features "rustc-test/capture unstable" +else + cargo test +fi + +cargo doc diff --git a/html5ever-2016-08-25/src/driver.rs b/html5ever-2016-08-25/src/driver.rs new file mode 100644 index 000000000..4e9b5b6a0 --- /dev/null +++ b/html5ever-2016-08-25/src/driver.rs @@ -0,0 +1,329 @@ +// Copyright 2014 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! High-level interface to the parser. + +use tokenizer::{Attribute, Tokenizer, TokenizerOpts}; +use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink}; + +use std::borrow::Cow; +use std::mem; + +use encoding::{self, EncodingRef}; +use string_cache::QualName; +use tendril; +use tendril::{StrTendril, ByteTendril}; +use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder}; + +/// All-encompassing options struct for the parser. +#[derive(Clone, Default)] +pub struct ParseOpts { + /// Tokenizer options. + pub tokenizer: TokenizerOpts, + + /// Tree builder options. + pub tree_builder: TreeBuilderOpts, +} + +/// Parse an HTML document +/// +/// The returned value implements `tendril::TendrilSink` +/// so that Unicode input may be provided incrementally, +/// or all at once with the `one` method. +/// +/// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`. 
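+///
+/// A minimal sketch, mirroring the unit tests at the bottom of this file:
+///
+/// ```ignore
+/// let dom = parse_document(RcDom::default(), ParseOpts::default())
+///     .from_utf8()
+///     .one("<title>Test</title>".as_bytes());
+/// ```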
+pub fn parse_document(sink: Sink, opts: ParseOpts) -> Parser where Sink: TreeSink { + let tb = TreeBuilder::new(sink, opts.tree_builder); + let tok = Tokenizer::new(tb, opts.tokenizer); + Parser { tokenizer: tok } +} + +/// Parse an HTML fragment +/// +/// The returned value implements `tendril::TendrilSink` +/// so that Unicode input may be provided incrementally, +/// or all at once with the `one` method. +/// +/// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`. +pub fn parse_fragment(mut sink: Sink, opts: ParseOpts, + context_name: QualName, context_attrs: Vec) + -> Parser + where Sink: TreeSink { + let context_elem = sink.create_element(context_name, context_attrs); + parse_fragment_for_element(sink, opts, context_elem, None) +} + +/// Like `parse_fragment`, but with an existing context element +/// and optionally a form element. +pub fn parse_fragment_for_element(sink: Sink, opts: ParseOpts, + context_element: Sink::Handle, + form_element: Option) + -> Parser + where Sink: TreeSink { + let tb = TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder); + let tok_opts = TokenizerOpts { + initial_state: Some(tb.tokenizer_state_for_context_elem()), + .. opts.tokenizer + }; + let tok = Tokenizer::new(tb, tok_opts); + Parser { tokenizer: tok } +} + +/// An HTML parser, +/// ready to recieve Unicode input through the `tendril::TendrilSink` trait’s methods. +pub struct Parser where Sink: TreeSink { + pub tokenizer: Tokenizer>, +} + +impl TendrilSink for Parser { + fn process(&mut self, t: StrTendril) { + self.tokenizer.feed(t) + } + + // FIXME: Is it too noisy to report every character decoding error? + fn error(&mut self, desc: Cow<'static, str>) { + self.tokenizer.sink_mut().sink_mut().parse_error(desc) + } + + type Output = Sink::Output; + + fn finish(mut self) -> Self::Output { + self.tokenizer.end(); + self.tokenizer.unwrap().unwrap().finish() + } +} + +impl Parser { + /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes. + /// + /// Use this when your input is bytes that are known to be in the UTF-8 encoding. + /// Decoding is lossy, like `String::from_utf8_lossy`. + pub fn from_utf8(self) -> Utf8LossyDecoder { + Utf8LossyDecoder::new(self) + } + + /// Wrap this parser into a `TendrilSink` that accepts bytes + /// and tries to detect the correct character encoding. + /// + /// Currently this looks for a Byte Order Mark, + /// then uses `BytesOpts::transport_layer_encoding`, + /// then falls back to UTF-8. + /// + /// FIXME(https://github.com/servo/html5ever/issues/18): this should look for `` elements + /// and other data per + /// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding + pub fn from_bytes(self, opts: BytesOpts) -> BytesParser { + BytesParser { + state: BytesParserState::Initial { parser: self }, + opts: opts, + } + } +} + +/// Options for choosing a character encoding +#[derive(Clone, Default)] +pub struct BytesOpts { + /// The character encoding specified by the transport layer, if any. + /// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header. + pub transport_layer_encoding: Option, +} + +/// An HTML parser, +/// ready to recieve bytes input through the `tendril::TendrilSink` trait’s methods. +/// +/// See `Parser::from_bytes`. 
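+///
+/// Incoming bytes are buffered until there are enough to attempt encoding
+/// detection (a BOM sniff, then the transport-layer encoding, then UTF-8);
+/// from then on they stream straight into the decoder.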
+pub struct BytesParser where Sink: TreeSink { + state: BytesParserState, + opts: BytesOpts, +} + +enum BytesParserState where Sink: TreeSink { + Initial { + parser: Parser, + }, + Buffering { + parser: Parser, + buffer: ByteTendril + }, + Parsing { + decoder: LossyDecoder>, + }, + Transient +} + +impl BytesParser { + /// Access the underlying Parser + pub fn str_parser(&self) -> &Parser { + match self.state { + BytesParserState::Initial { ref parser } => parser, + BytesParserState::Buffering { ref parser, .. } => parser, + BytesParserState::Parsing { ref decoder } => decoder.inner_sink(), + BytesParserState::Transient => unreachable!(), + } + } + + /// Access the underlying Parser + pub fn str_parser_mut(&mut self) -> &mut Parser { + match self.state { + BytesParserState::Initial { ref mut parser } => parser, + BytesParserState::Buffering { ref mut parser, .. } => parser, + BytesParserState::Parsing { ref mut decoder } => decoder.inner_sink_mut(), + BytesParserState::Transient => unreachable!(), + } + } + + /// Insert a Unicode chunk in the middle of the byte stream. + /// + /// This is e.g. for supporting `document.write`. + pub fn process_unicode(&mut self, t: StrTendril) { + if t.is_empty() { + return // Don’t prevent buffering/encoding detection + } + if let BytesParserState::Parsing { ref mut decoder } = self.state { + decoder.inner_sink_mut().process(t) + } else { + match mem::replace(&mut self.state, BytesParserState::Transient) { + BytesParserState::Initial { mut parser } => { + parser.process(t); + self.start_parsing(parser, ByteTendril::new()) + } + BytesParserState::Buffering { parser, buffer } => { + self.start_parsing(parser, buffer); + if let BytesParserState::Parsing { ref mut decoder } = self.state { + decoder.inner_sink_mut().process(t) + } else { + unreachable!() + } + } + BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(), + } + } + } + + fn start_parsing(&mut self, parser: Parser, buffer: ByteTendril) { + let encoding = detect_encoding(&buffer, &self.opts); + let mut decoder = LossyDecoder::new(encoding, parser); + decoder.process(buffer); + self.state = BytesParserState::Parsing { decoder: decoder } + } +} + +impl TendrilSink for BytesParser { + fn process(&mut self, t: ByteTendril) { + if let &mut BytesParserState::Parsing { ref mut decoder } = &mut self.state { + return decoder.process(t) + } + let (parser, buffer) = match mem::replace(&mut self.state, BytesParserState::Transient) { + BytesParserState::Initial{ parser } => (parser, t), + BytesParserState::Buffering { parser, mut buffer } => { + buffer.push_tendril(&t); + (parser, buffer) + } + BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(), + }; + if buffer.len32() >= PRESCAN_BYTES { + self.start_parsing(parser, buffer) + } else { + self.state = BytesParserState::Buffering { + parser: parser, + buffer: buffer, + } + } + } + + fn error(&mut self, desc: Cow<'static, str>) { + match self.state { + BytesParserState::Initial { ref mut parser } => parser.error(desc), + BytesParserState::Buffering { ref mut parser, .. 
} => parser.error(desc), + BytesParserState::Parsing { ref mut decoder } => decoder.error(desc), + BytesParserState::Transient => unreachable!(), + } + } + + type Output = Sink::Output; + + fn finish(self) -> Self::Output { + match self.state { + BytesParserState::Initial { parser } => parser.finish(), + BytesParserState::Buffering { parser, buffer } => { + let encoding = detect_encoding(&buffer, &self.opts); + let mut decoder = LossyDecoder::new(encoding, parser); + decoder.process(buffer); + decoder.finish() + }, + BytesParserState::Parsing { decoder } => decoder.finish(), + BytesParserState::Transient => unreachable!(), + } + } +} + +/// How many bytes does detect_encoding() need +// FIXME(#18): should be 1024 for elements. +const PRESCAN_BYTES: u32 = 3; + +/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding +fn detect_encoding(bytes: &ByteTendril, opts: &BytesOpts) -> EncodingRef { + if bytes.starts_with(b"\xEF\xBB\xBF") { + return encoding::all::UTF_8 + } + if bytes.starts_with(b"\xFE\xFF") { + return encoding::all::UTF_16BE + } + if bytes.starts_with(b"\xFF\xFE") { + return encoding::all::UTF_16LE + } + if let Some(encoding) = opts.transport_layer_encoding { + return encoding + } + // FIXME(#18): etc. + return encoding::all::UTF_8 +} + +#[cfg(test)] +mod tests { + use rcdom::RcDom; + use serialize::serialize; + use std::iter::repeat; + use tendril::TendrilSink; + use super::*; + + #[test] + fn from_utf8() { + assert_serialization( + parse_document(RcDom::default(), ParseOpts::default()) + .from_utf8() + .one("Test".as_bytes())); + } + + #[test] + fn from_bytes_one() { + assert_serialization( + parse_document(RcDom::default(), ParseOpts::default()) + .from_bytes(BytesOpts::default()) + .one("<title>Test".as_bytes())); + } + + #[test] + fn from_bytes_iter() { + assert_serialization( + parse_document(RcDom::default(), ParseOpts::default()) + .from_bytes(BytesOpts::default()) + .from_iter([ + "<title>Test".as_bytes(), + repeat(' ').take(1200).collect::<String>().as_bytes(), + ].iter().cloned())); + } + + fn assert_serialization(dom: RcDom) { + let mut serialized = Vec::new(); + serialize(&mut serialized, &dom.document, Default::default()).unwrap(); + assert_eq!(String::from_utf8(serialized).unwrap().replace(" ", ""), + "<html><head><title>Test"); + } +} diff --git a/html5ever-2015-05-15/src/lib.rs b/html5ever-2016-08-25/src/lib.rs similarity index 65% rename from html5ever-2015-05-15/src/lib.rs rename to html5ever-2016-08-25/src/lib.rs index 9d01104c8..f394953c2 100644 --- a/html5ever-2015-05-15/src/lib.rs +++ b/html5ever-2016-08-25/src/lib.rs @@ -10,13 +10,13 @@ #![crate_name="html5ever"] #![crate_type="dylib"] -#![feature(plugin, box_syntax, str_char, slice_patterns)] -#![feature(str_escape, iter_arith)] -#![deny(warnings)] +#![cfg_attr(test, deny(warnings))] #![allow(unused_parens)] -#![plugin(phf_macros)] -#![plugin(html5ever_macros)] +#![cfg_attr(feature = "heap_size", feature(plugin, custom_derive))] +#![cfg_attr(feature = "heap_size", plugin(heapsize_plugin))] +#[cfg(feature = "heap_size")] +extern crate heapsize; #[macro_use] extern crate log; @@ -29,10 +29,8 @@ extern crate mac; extern crate phf; -extern crate time; - pub use tokenizer::Attribute; -pub use driver::{one_input, ParseOpts, parse_to, parse_fragment_to, parse, parse_fragment}; +pub use driver::{ParseOpts, parse_document, parse_fragment, Parser}; pub use serialize::serialize; @@ -47,7 +45,15 @@ mod util { pub mod tokenizer; pub mod tree_builder; - pub mod serialize; - pub mod 
driver; +pub mod rcdom; + +/// Re-export the tendril crate. +pub mod tendril { + extern crate tendril; + pub use self::tendril::*; +} + +/// Re-export the encoding crate. +pub use tendril::encoding; diff --git a/html5ever-2015-05-15/src/macros.rs b/html5ever-2016-08-25/src/macros.rs similarity index 84% rename from html5ever-2015-05-15/src/macros.rs rename to html5ever-2016-08-25/src/macros.rs index 15eeff594..82815191a 100644 --- a/html5ever-2015-05-15/src/macros.rs +++ b/html5ever-2016-08-25/src/macros.rs @@ -24,9 +24,10 @@ macro_rules! unwrap_or_return { macro_rules! time { ($e:expr) => {{ - let t0 = ::time::precise_time_ns(); + let now = ::std::time::Instant::now(); let result = $e; - let dt = ::time::precise_time_ns() - t0; + let d = now.elapsed(); + let dt = d.as_secs() * 1_000_000_000 + u64::from(d.subsec_nanos()); (result, dt) }} } diff --git a/html5ever-2015-05-15/dom_sink/src/rcdom.rs b/html5ever-2016-08-25/src/rcdom.rs similarity index 63% rename from html5ever-2015-05-15/dom_sink/src/rcdom.rs rename to html5ever-2016-08-25/src/rcdom.rs index a09b2a881..5ba8faf37 100644 --- a/html5ever-2015-05-15/dom_sink/src/rcdom.rs +++ b/html5ever-2016-08-25/src/rcdom.rs @@ -12,35 +12,71 @@ //! This is sufficient as a static parse tree, but don't build a //! web browser using it. :) -use common::{NodeEnum, Document, Doctype, Text, Comment, Element}; - -use html5ever::tokenizer::Attribute; -use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; -use html5ever::tree_builder; -use html5ever::serialize::{Serializable, Serializer}; -use html5ever::serialize::TraversalScope; -use html5ever::serialize::TraversalScope::{IncludeNode, ChildrenOnly}; -use html5ever::driver::ParseResult; - +use std::ascii::AsciiExt; use std::cell::RefCell; +use std::collections::HashSet; use std::default::Default; -use std::rc::{Rc, Weak}; use std::borrow::Cow; use std::io::{self, Write}; +use std::mem; use std::ops::{Deref, DerefMut}; +use std::rc::{Rc, Weak}; use string_cache::QualName; +use tendril::StrTendril; + +use tokenizer::Attribute; +use tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; +use tree_builder; +use serialize::{Serializable, Serializer}; +use serialize::TraversalScope; +use serialize::TraversalScope::{IncludeNode, ChildrenOnly}; + +pub use self::ElementEnum::{AnnotationXml, Normal, Script, Template}; +pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element}; + +/// The different kinds of elements in the DOM. +#[derive(Debug)] +pub enum ElementEnum { + Normal, + /// A script element and its "already started" flag. + /// https://html.spec.whatwg.org/multipage/#already-started + Script(bool), + /// A template element and its template contents. + /// https://html.spec.whatwg.org/multipage/#template-contents + Template(Handle), + /// An annotation-xml element in the MathML namespace whose start tag token had an attribute + /// with the name "encoding" whose value was an ASCII case-insensitive match for the string + /// "text/html" or "application/xhtml+xml" + /// https://html.spec.whatwg.org/multipage/embedded-content.html#math:annotation-xml + AnnotationXml(bool), +} + +/// The different kinds of nodes in the DOM. +#[derive(Debug)] +pub enum NodeEnum { + /// The `Document` itself. + Document, + + /// A `DOCTYPE` with name, public id, and system id. + Doctype(StrTendril, StrTendril, StrTendril), + + /// A text node. + Text(StrTendril), + + /// A comment. + Comment(StrTendril), + + /// An element with attributes. 
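+    /// The fields are the qualified name, the kind of element, and the
+    /// attribute list.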
+ Element(QualName, ElementEnum, Vec), +} /// A DOM node. +#[derive(Debug)] pub struct Node { pub node: NodeEnum, pub parent: Option, pub children: Vec, - - /// The "script already started" flag. - /// - /// Not meaningful for nodes other than HTML ` not fully implemented"); + if self.sink.complete_script(node) == + NextParserState::Suspend { + self.next_tokenizer_state = + Some(Quiescent); + } + } + self.mode = self.orig_mode.take().unwrap(); + Done + } + (_, _) => panic!("impossible case in Text mode"), + } + } + }, + InTable => + match token { + NullCharacterToken => self.process_chars_in_table(token), + CharacterTokens(..) => self.process_chars_in_table(token), + CommentToken(text) => self.append_comment(text), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("caption"), .. }) + => { + self.pop_until_current(table_scope); + self.active_formatting.push(Marker); + self.insert_element_for(tag); + self.mode = InCaption; + Done + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("colgroup"), .. }) + => { + self.pop_until_current(table_scope); + self.insert_element_for(tag); + self.mode = InColumnGroup; + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("col"), .. }) => { + self.pop_until_current(table_scope); + self.insert_phantom(atom!("colgroup")); + Reprocess(InColumnGroup, token) + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("thead"), .. }) => + { + self.pop_until_current(table_scope); + self.insert_element_for(tag); + self.mode = InTableBody; + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tr"), .. }) => { + self.pop_until_current(table_scope); + self.insert_phantom(atom!("tbody")); + Reprocess(InTableBody, token) + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("table"), .. }) => + { + self.unexpected(&token); + if self.in_scope_named(table_scope, atom!("table")) { + self.pop_until_named(atom!("table")); + Reprocess(self.reset_insertion_mode(), token) + } else { Done } + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("table"), .. }) => + { + if self.in_scope_named(table_scope, atom!("table")) { + self.pop_until_named(atom!("table")); + self.mode = self.reset_insertion_mode(); + } else { self.unexpected(&token); } + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("body"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("col"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("colgroup"), .. 
}) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("html"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tr"), .. }) => + self.unexpected(&token), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("style"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("script"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("template"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("template"), .. }) + => self.step(InHead, token), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("input"), .. }) => + { + self.unexpected(&tag); + if self.is_type_hidden(&tag) { + self.insert_and_pop_element_for(tag); + DoneAckSelfClosing + } else { self.foster_parent_in_body(TagToken(tag)) } + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("form"), .. }) => + { + self.unexpected(&tag); + if !self.in_html_elem_named(atom!("template")) && + self.form_elem.is_none() { + self.form_elem = + Some(self.insert_and_pop_element_for(tag)); + } + Done + } + EOFToken => self.step(InBody, token), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => { + self.unexpected(&token); + self.foster_parent_in_body(token) + } + } + } + }, + InTableText => + match token { + NullCharacterToken => self.unexpected(&token), + CharacterTokens(split, text) => { + self.pending_table_text.push((split, text)); + Done + } + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => { + let pending = + replace(&mut self.pending_table_text, vec!()); + let contains_nonspace = + pending.iter().any(|&(split, ref text)| { + match split { + Whitespace => false, + NotWhitespace => true, + NotSplit => + any_not_whitespace(text), + } }); + if contains_nonspace { + self.sink.parse_error(Borrowed("Non-space table text")); + for (split, text) in pending.into_iter() { + match self.foster_parent_in_body(CharacterTokens(split, + text)) + { + Done => (), + _ => + panic!("not prepared to handle this!"), + } + } + } else { + for (_, text) in pending.into_iter() { + self.append_text(text); + } + } + Reprocess(self.orig_mode.take().unwrap(), token) + } + } + } + }, + InCaption => + match token { + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("col"), .. 
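// In the InTableText arm above, character tokens are only buffered in
// `pending_table_text`; the foster-parenting decision is made when the mode
// is left. A self-contained sketch of that decision, with simplified types
// (html5ever's real SplitStatus/tendril machinery is not used here):

#[derive(Clone, Copy)]
enum Split { Whitespace, NotWhitespace, NotSplit }

fn any_not_whitespace(text: &str) -> bool {
    // The diff's own FIXME notes this might be faster as a byte scan.
    text.chars().any(|c| !c.is_ascii_whitespace())
}

fn needs_foster_parenting(pending: &[(Split, String)]) -> bool {
    pending.iter().any(|&(split, ref text)| match split {
        Split::Whitespace => false,
        Split::NotWhitespace => true,
        Split::NotSplit => any_not_whitespace(text),
    })
}

fn main() {
    let pending = vec![
        (Split::Whitespace, "  \n".to_string()),
        (Split::NotSplit, "cell text".to_string()),
    ];
    // Non-space text inside a table is a parse error; the whole batch is
    // then foster-parented instead of appended to the table.
    assert!(needs_foster_parenting(&pending));
}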
}) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tr"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("table"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("caption"), .. }) + => { + if self.in_scope_named(table_scope, atom!("caption")) { + self.generate_implied_end(cursory_implied_end); + self.expect_to_close(atom!("caption")); + self.clear_active_formatting_to_marker(); + match tag { + Tag { kind: EndTag, name: atom!("caption"), .. } + => { + self.mode = InTable; + Done + } + _ => Reprocess(InTable, TagToken(tag)), + } + } else { self.unexpected(&tag); Done } + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("body"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("col"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("html"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tr"), .. }) => + self.unexpected(&token), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.step(InBody, token), + } + } + }, + InColumnGroup => + match token { + CharacterTokens(NotSplit, text) => SplitWhitespace(text), + CharacterTokens(Whitespace, text) => self.append_text(text), + CommentToken(text) => self.append_comment(text), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("html"), .. }) => + self.step(InBody, token), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("col"), .. }) => { + self.insert_and_pop_element_for(tag); + DoneAckSelfClosing + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("colgroup"), .. 
}) + => { + if self.current_node_named(atom!("colgroup")) { + self.pop(); + self.mode = InTable; + } else { self.unexpected(&token); } + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("col"), .. }) => + self.unexpected(&token), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("template"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("template"), .. }) + => self.step(InHead, token), + EOFToken => self.step(InBody, token), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => { + if self.current_node_named(atom!("colgroup")) { + self.pop(); + Reprocess(InTable, token) + } else { self.unexpected(&token) } + } + } + } + }, + InTableBody => + match token { + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tr"), .. }) => { + self.pop_until_current(table_body_context); + self.insert_element_for(tag); + self.mode = InRow; + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("td"), .. }) => { + self.unexpected(&token); + self.pop_until_current(table_body_context); + self.insert_phantom(atom!("tr")); + Reprocess(InRow, token) + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("thead"), .. }) => + { + if self.in_scope_named(table_scope, tag.name.clone()) { + self.pop_until_current(table_body_context); + self.pop(); + self.mode = InTable; + } else { self.unexpected(&tag); } + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("col"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("table"), .. }) => + { + declare_tag_set!(table_outer = "table" "tbody" "tfoot"); + if self.in_scope(table_scope, + |e| self.elem_in(e, table_outer)) { + self.pop_until_current(table_body_context); + self.pop(); + Reprocess(InTable, token) + } else { self.unexpected(&token) } + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("body"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("col"), .. 
}) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("html"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tr"), .. }) => + self.unexpected(&token), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.step(InTable, token), + } + } + }, + InRow => + match token { + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("td"), .. }) => { + self.pop_until_current(table_row_context); + self.insert_element_for(tag); + self.mode = InCell; + self.active_formatting.push(Marker); + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tr"), .. }) => { + if self.in_scope_named(table_scope, atom!("tr")) { + self.pop_until_current(table_row_context); + let node = self.pop(); + self.assert_named(node, atom!("tr")); + self.mode = InTableBody; + } else { self.unexpected(&token); } + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("col"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tr"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("table"), .. }) => + { + if self.in_scope_named(table_scope, atom!("tr")) { + self.pop_until_current(table_row_context); + let node = self.pop(); + self.assert_named(node, atom!("tr")); + Reprocess(InTableBody, token) + } else { self.unexpected(&token) } + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("thead"), .. }) => + { + if self.in_scope_named(table_scope, tag.name.clone()) { + if self.in_scope_named(table_scope, atom!("tr")) { + self.pop_until_current(table_row_context); + let node = self.pop(); + self.assert_named(node, atom!("tr")); + Reprocess(InTableBody, TagToken(tag)) + } else { Done } + } else { self.unexpected(&tag) } + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("body"), .. 
}) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("col"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("html"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("th"), .. }) => + self.unexpected(&token), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.step(InTable, token), + } + } + }, + InCell => + match token { + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("th"), .. }) => { + if self.in_scope_named(table_scope, tag.name.clone()) { + self.generate_implied_end(cursory_implied_end); + self.expect_to_close(tag.name); + self.clear_active_formatting_to_marker(); + self.mode = InRow; + } else { self.unexpected(&tag); } + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("col"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("th"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tr"), .. }) => { + if self.in_scope(table_scope, + |n| self.elem_in(n.clone(), td_th)) { + self.close_the_cell(); + Reprocess(InRow, token) + } else { self.unexpected(&token) } + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("body"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("col"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("html"), .. }) => + self.unexpected(&token), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("table"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tbody"), .. 
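// Nearly every arm above guards on `in_scope_named(table_scope, ...)`. The
// underlying check walks the open-element stack from the innermost element
// outward. A simplified sketch over bare tag names; html5ever matches full
// `QualName`s, namespace included.

fn in_scope_named(open_elems: &[&str], scope_delimiter: fn(&str) -> bool,
                  target: &str) -> bool {
    for &name in open_elems.iter().rev() {
        if name == target {
            return true; // found before any delimiter: in scope
        }
        if scope_delimiter(name) {
            return false; // a boundary such as <table> cuts the search off
        }
    }
    false
}

fn table_scope(name: &str) -> bool {
    matches!(name, "html" | "table" | "template")
}

fn main() {
    let stack = ["html", "body", "table", "tbody", "tr"];
    assert!(in_scope_named(&stack, table_scope, "tr"));
    // <body> sits outside the enclosing <table>, so it is not in table scope.
    assert!(!in_scope_named(&stack, table_scope, "body"));
}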
}) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tr"), .. }) => { + if self.in_scope_named(table_scope, tag.name.clone()) { + self.close_the_cell(); + Reprocess(InRow, TagToken(tag)) + } else { self.unexpected(&tag) } + } + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.step(InBody, token), + } + } + }, + InSelect => + match token { + NullCharacterToken => self.unexpected(&token), + CharacterTokens(_, text) => self.append_text(text), + CommentToken(text) => self.append_comment(text), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("html"), .. }) => + self.step(InBody, token), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("option"), .. }) + => { + if self.current_node_named(atom!("option")) { + self.pop(); + } + self.insert_element_for(tag); + Done + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("optgroup"), .. }) + => { + if self.current_node_named(atom!("option")) { + self.pop(); + } + if self.current_node_named(atom!("optgroup")) { + self.pop(); + } + self.insert_element_for(tag); + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("optgroup"), .. }) + => { + if self.open_elems.len() >= 2 && + self.current_node_named(atom!("option")) && + self.html_elem_named(self.open_elems[self.open_elems.len() + - + 2].clone(), + atom!("optgroup")) { + self.pop(); + } + if self.current_node_named(atom!("optgroup")) { + self.pop(); + } else { self.unexpected(&token); } + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("option"), .. }) + => { + if self.current_node_named(atom!("option")) { + self.pop(); + } else { self.unexpected(&token); } + Done + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("select"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("select"), .. }) + => { + let in_scope = + self.in_scope_named(select_scope, atom!("select")); + if !in_scope || tag.kind == StartTag { + self.unexpected(&tag); + } + if in_scope { + self.pop_until_named(atom!("select")); + self.mode = self.reset_insertion_mode(); + } + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("input"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("keygen"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("textarea"), .. }) + => { + self.unexpected(&token); + if self.in_scope_named(select_scope, atom!("select")) { + self.pop_until_named(atom!("select")); + Reprocess(self.reset_insertion_mode(), token) + } else { Done } + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("script"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("template"), .. 
}) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("template"), .. }) + => self.step(InHead, token), + EOFToken => self.step(InBody, token), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.unexpected(&token), + } + } + }, + InSelectInTable => + match token { + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("table"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tr"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("th"), .. }) => { + self.unexpected(&token); + self.pop_until_named(atom!("select")); + Reprocess(self.reset_insertion_mode(), token) + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("table"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("thead"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("tr"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("th"), .. }) => { + self.unexpected(&tag); + if self.in_scope_named(table_scope, tag.name.clone()) { + self.pop_until_named(atom!("select")); + Reprocess(self.reset_insertion_mode(), TagToken(tag)) + } else { Done } + } + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.step(InSelect, token), + } + } + }, + InTemplate => + match token { + CharacterTokens(_, _) => self.step(InBody, token), + CommentToken(_) => self.step(InBody, token), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("base"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("basefont"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("bgsound"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("link"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("meta"), .. 
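// The <select> arms above pop to the nearest <select> and then recompute the
// mode via `reset_insertion_mode`. A simplified sketch of the spec's "reset
// the insertion mode appropriately" step
// (https://html.spec.whatwg.org/multipage/#reset-the-insertion-mode); the
// fragment, <template>, <head>, and <html> special cases are omitted.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Mode { InSelect, InCell, InRow, InTableBody, InCaption,
            InColumnGroup, InTable, InBody }

fn reset_insertion_mode(open_elems: &[&str]) -> Mode {
    // Scan from the current node outward; the first recognized element
    // determines the mode.
    for &name in open_elems.iter().rev() {
        match name {
            "select" => return Mode::InSelect,
            "td" | "th" => return Mode::InCell,
            "tr" => return Mode::InRow,
            "tbody" | "thead" | "tfoot" => return Mode::InTableBody,
            "caption" => return Mode::InCaption,
            "colgroup" => return Mode::InColumnGroup,
            "table" => return Mode::InTable,
            _ => {}
        }
    }
    Mode::InBody
}

fn main() {
    let stack = ["html", "body", "table", "tbody", "tr"];
    assert_eq!(reset_insertion_mode(&stack), Mode::InRow);
}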
}) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("noframes"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("script"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("style"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("template"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("title"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("template"), .. }) + => { + self.step(InHead, token) + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("caption"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("colgroup"), .. }) + | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tbody"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tfoot"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("thead"), .. }) => + { + self.template_modes.pop(); + self.template_modes.push(InTable); + Reprocess(InTable, token) + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("col"), .. }) => { + self.template_modes.pop(); + self.template_modes.push(InColumnGroup); + Reprocess(InColumnGroup, token) + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tr"), .. }) => { + self.template_modes.pop(); + self.template_modes.push(InTableBody); + Reprocess(InTableBody, token) + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("td"), .. }) | + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("th"), .. }) => { + self.template_modes.pop(); + self.template_modes.push(InRow); + Reprocess(InRow, token) + } + EOFToken => { + if !self.in_html_elem_named(atom!("template")) { + self.stop_parsing() + } else { + self.unexpected(&token); + self.pop_until_named(atom!("template")); + self.clear_active_formatting_to_marker(); + self.template_modes.pop(); + self.mode = self.reset_insertion_mode(); + Reprocess(self.reset_insertion_mode(), token) + } + } + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (true, + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + .. })) => { + self.template_modes.pop(); + self.template_modes.push(InBody); + Reprocess(InBody, TagToken(tag)) + } + (_, token) => self.unexpected(&token), + } + } + }, + AfterBody => + match token { + CharacterTokens(NotSplit, text) => SplitWhitespace(text), + CharacterTokens(Whitespace, _) => self.step(InBody, token), + CommentToken(text) => self.append_comment_to_html(text), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("html"), .. }) => + self.step(InBody, token), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("html"), .. 
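// The InTemplate arms above keep a stack of template insertion modes,
// replacing its top and reprocessing the token whenever a table-section tag
// shows up inside <template>. A compact sketch of that bookkeeping; the mode
// names mirror the diff, everything else is elided.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Mode { InTemplate, InTable, InColumnGroup, InTableBody, InRow, InBody }

fn retarget(template_modes: &mut Vec<Mode>, start_tag: &str) -> Mode {
    let next = match start_tag {
        "caption" | "colgroup" | "tbody" | "tfoot" | "thead" => Mode::InTable,
        "col" => Mode::InColumnGroup,
        "tr" => Mode::InTableBody,
        "td" | "th" => Mode::InRow,
        _ => Mode::InBody,
    };
    template_modes.pop();
    template_modes.push(next);
    next // the caller reprocesses the same token in this mode
}

fn main() {
    let mut modes = vec![Mode::InTemplate];
    assert_eq!(retarget(&mut modes, "tr"), Mode::InTableBody);
    assert_eq!(modes, vec![Mode::InTableBody]);
}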
}) => + { + if self.is_fragment() { + self.unexpected(&token); + } else { self.mode = AfterAfterBody; } + Done + } + EOFToken => self.stop_parsing(), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => { + self.unexpected(&token); + Reprocess(InBody, token) + } + } + } + }, + InFrameset => + match token { + CharacterTokens(NotSplit, text) => SplitWhitespace(text), + CharacterTokens(Whitespace, text) => self.append_text(text), + CommentToken(text) => self.append_comment(text), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("html"), .. }) => + self.step(InBody, token), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("frameset"), .. }) + => { + self.insert_element_for(tag); + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("frameset"), .. }) + => { + if self.open_elems.len() == 1 { + self.unexpected(&token); + } else { + self.pop(); + if !self.is_fragment() && + !self.current_node_named(atom!("frameset")) { + self.mode = AfterFrameset; + } + } + Done + } + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("frame"), .. }) => + { + self.insert_and_pop_element_for(tag); + DoneAckSelfClosing + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("noframes"), .. }) + => self.step(InHead, token), + EOFToken => { + if self.open_elems.len() != 1 { self.unexpected(&token); } + self.stop_parsing() + } + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.unexpected(&token), + } + } + }, + AfterFrameset => + match token { + CharacterTokens(NotSplit, text) => SplitWhitespace(text), + CharacterTokens(Whitespace, text) => self.append_text(text), + CommentToken(text) => self.append_comment(text), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("html"), .. }) => + self.step(InBody, token), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::EndTag, + name: atom!("html"), .. }) => + { + self.mode = AfterAfterFrameset; + Done + } + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("noframes"), .. }) + => self.step(InHead, token), + EOFToken => self.stop_parsing(), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.unexpected(&token), + } + } + }, + AfterAfterBody => + match token { + CharacterTokens(NotSplit, text) => SplitWhitespace(text), + CharacterTokens(Whitespace, _) => self.step(InBody, token), + CommentToken(text) => self.append_comment_to_doc(text), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("html"), .. 
}) => + self.step(InBody, token), + EOFToken => self.stop_parsing(), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => { + self.unexpected(&token); + Reprocess(InBody, token) + } + } + } + }, + AfterAfterFrameset => + match token { + CharacterTokens(NotSplit, text) => SplitWhitespace(text), + CharacterTokens(Whitespace, _) => self.step(InBody, token), + CommentToken(text) => self.append_comment_to_doc(text), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("html"), .. }) => + self.step(InBody, token), + EOFToken => self.stop_parsing(), + ::tree_builder::types::TagToken(::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("noframes"), .. }) + => self.step(InHead, token), + last_arm_token => { + let enable_wildcards = + match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (_, token) => self.unexpected(&token), + } + } + }, + } + } + fn step_foreign(&mut self, token: Token) -> ProcessResult { + match token { + NullCharacterToken => { + self.unexpected(&token); + self.append_text("\u{fffd}".to_tendril()) + } + CharacterTokens(_, text) => { + if any_not_whitespace(&text) { self.frameset_ok = false; } + self.append_text(text) + } + CommentToken(text) => self.append_comment(text), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("b"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("big"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("blockquote"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("body"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("br"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("center"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("code"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("dd"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("div"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("dl"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("dt"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("em"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("embed"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("h1"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("h2"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("h3"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("h4"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("h5"), .. 
}) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("h6"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("head"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("hr"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("i"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("img"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("li"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("listing"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("menu"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("meta"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("nobr"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("ol"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("p"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("pre"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("ruby"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("s"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("small"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("span"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("strong"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("strike"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("sub"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("sup"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("table"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("tt"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("u"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("ul"), .. }) | + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("var"), .. }) => + self.unexpected_start_tag_in_foreign_content(tag), + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + name: atom!("font"), .. }) => { + let unexpected = + tag.attrs.iter().any(|attr| { + matches!(attr . name , qualname ! ( + "" , "color" ) | qualname ! + ( "" , "face" ) | qualname ! 
+ ( "" , "size" )) }); + if unexpected { + self.unexpected_start_tag_in_foreign_content(tag) + } else { self.foreign_start_tag(tag) } + } + last_arm_token => { + let enable_wildcards = match last_arm_token { _ => true, }; + match (enable_wildcards, last_arm_token) { + (true, + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::StartTag, + .. })) => + self.foreign_start_tag(tag), + (true, + ::tree_builder::types::TagToken(tag@::tokenizer::Tag { + kind: ::tokenizer::EndTag, + .. })) => { + let mut first = true; + let mut stack_idx = self.open_elems.len() - 1; + loop { + if stack_idx == 0 { return Done; } + let node = self.open_elems[stack_idx].clone(); + let node_name = self.sink.elem_name(node); + if !first && node_name.ns == ns!(html) { + let mode = self.mode; + return self.step(mode, TagToken(tag)); + } + if (&*node_name.local).eq_ignore_ascii_case(&*tag.name) + { + self.open_elems.truncate(stack_idx); + return Done; + } + if first { self.unexpected(&tag); first = false; } + stack_idx -= 1; + } + } + (_, _) => panic!("impossible case in foreign content"), + } + } + } + } +} \ No newline at end of file diff --git a/html5ever-2015-05-15/src/tree_builder/rules.rs b/html5ever-2016-08-25/src/tree_builder/rules.rs similarity index 79% rename from html5ever-2015-05-15/src/tree_builder/rules.rs rename to html5ever-2016-08-25/src/tree_builder/rules.rs index ce01bfab5..c5c12108a 100644 --- a/html5ever-2015-05-15/src/tree_builder/rules.rs +++ b/html5ever-2016-08-25/src/tree_builder/rules.rs @@ -11,19 +11,22 @@ use tree_builder::types::*; use tree_builder::tag_sets::*; -use tree_builder::actions::TreeBuilderActions; +use tree_builder::actions::{NoPush, Push, TreeBuilderActions}; use tree_builder::interface::{TreeSink, Quirks, AppendNode, NextParserState}; -use tokenizer::{Tag, StartTag, EndTag}; +use tokenizer::{Attribute, EndTag, StartTag, Tag}; use tokenizer::states::{Rcdata, Rawtext, ScriptData, Plaintext, Quiescent}; -use util::str::{AsciiExt, is_ascii_whitespace}; +use util::str::is_ascii_whitespace; +use std::ascii::AsciiExt; use std::mem::replace; use std::borrow::Cow::Borrowed; use std::borrow::ToOwned; -fn any_not_whitespace(x: &String) -> bool { +use tendril::{StrTendril, SliceExt}; + +fn any_not_whitespace(x: &StrTendril) -> bool { // FIXME: this might be much faster as a byte scan x.chars().any(|c| !is_ascii_whitespace(c)) } @@ -99,7 +102,7 @@ impl TreeBuilderStep tag @ => self.unexpected(&tag), token => { - self.head_elem = Some(self.insert_phantom(atom!(head))); + self.head_elem = Some(self.insert_phantom(atom!("head"))); Reprocess(InHead, token) } }), @@ -124,7 +127,7 @@ impl TreeBuilderStep } tag @ <style> <noscript> => { - if (!self.opts.scripting_enabled) && (tag.name == atom!(noscript)) { + if (!self.opts.scripting_enabled) && (tag.name == atom!("noscript")) { self.insert_element_for(tag); self.mode = InHeadNoscript; } else { @@ -134,7 +137,7 @@ impl<Handle, Sink> TreeBuilderStep } tag @ <script> => { - let elem = self.sink.create_element(qualname!(HTML, script), tag.attrs); + let elem = self.sink.create_element(qualname!(html, "script"), tag.attrs); if self.is_fragment() { self.sink.mark_script_already_started(elem.clone()); } @@ -162,11 +165,11 @@ impl<Handle, Sink> TreeBuilderStep } tag @ </template> => { - if !self.in_scope_named(default_scope, atom!(template)) { + if !self.in_html_elem_named(atom!("template")) { self.unexpected(&tag); } else { self.generate_implied_end(thorough_implied_end); - self.expect_to_close(atom!(template)); + 
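// The wildcard end-tag arm above implements the foreign-content end-tag
// rule: scan the open-element stack top-down, matching tag names ASCII
// case-insensitively, and hand the token back to the regular insertion mode
// if an HTML-namespace element is reached first. A standalone sketch;
// `Ns` and `Outcome` are illustrative simplifications of html5ever's types.

#[derive(PartialEq)]
enum Ns { Html, Svg }

enum Outcome { Popped(usize), HandledAsHtml, Ignored }

fn foreign_end_tag(stack: &[(Ns, String)], tag_name: &str) -> Outcome {
    let mut first = true;
    let mut idx = stack.len() - 1;
    loop {
        if idx == 0 { return Outcome::Ignored; }
        let (ref ns, ref name) = stack[idx];
        if !first && *ns == Ns::Html {
            return Outcome::HandledAsHtml;
        }
        if name.eq_ignore_ascii_case(tag_name) {
            return Outcome::Popped(idx); // truncate the stack to `idx`
        }
        first = false;
        idx -= 1;
    }
}

fn main() {
    let stack = vec![(Ns::Html, "html".to_string()),
                     (Ns::Svg, "svg".to_string()),
                     (Ns::Svg, "PATH".to_string())];
    // "PATH" matches </path> case-insensitively, per the spec.
    assert!(matches!(foreign_end_tag(&stack, "path"), Outcome::Popped(2)));
}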
self.expect_to_close(atom!("template")); self.clear_active_formatting_to_marker(); self.template_modes.pop(); self.mode = self.reset_insertion_mode(); @@ -252,7 +255,7 @@ impl<Handle, Sink> TreeBuilderStep tag @ </_> => self.unexpected(&tag), token => { - self.insert_phantom(atom!(body)); + self.insert_phantom(atom!("body")); Reprocess(InBody, token) } }), @@ -273,9 +276,10 @@ impl<Handle, Sink> TreeBuilderStep tag @ <html> => { self.unexpected(&tag); - // FIXME: <template> - let top = self.html_elem(); - self.sink.add_attrs_if_missing(top, tag.attrs); + if !self.in_html_elem_named(atom!("template")) { + let top = self.html_elem(); + self.sink.add_attrs_if_missing(top, tag.attrs); + } Done } @@ -286,13 +290,13 @@ impl<Handle, Sink> TreeBuilderStep tag @ <body> => { self.unexpected(&tag); - // FIXME: <template> match self.body_elem() { - None => (), - Some(node) => { + Some(ref node) if self.open_elems.len() != 1 && + !self.in_html_elem_named(atom!("template")) => { self.frameset_ok = false; - self.sink.add_attrs_if_missing(node, tag.attrs) - } + self.sink.add_attrs_if_missing(node.clone(), tag.attrs) + }, + _ => {} } Done } @@ -317,13 +321,16 @@ impl<Handle, Sink> TreeBuilderStep } EOFToken => { - // FIXME: <template> - self.check_body_end(); - self.stop_parsing() + if !self.template_modes.is_empty() { + self.step(InTemplate, token) + } else { + self.check_body_end(); + self.stop_parsing() + } } </body> => { - if self.in_scope_named(default_scope, atom!(body)) { + if self.in_scope_named(default_scope, atom!("body")) { self.check_body_end(); self.mode = AfterBody; } else { @@ -333,7 +340,7 @@ impl<Handle, Sink> TreeBuilderStep } </html> => { - if self.in_scope_named(default_scope, atom!(body)) { + if self.in_scope_named(default_scope, atom!("body")) { self.check_body_end(); Reprocess(AfterBody, token) } else { @@ -369,25 +376,26 @@ impl<Handle, Sink> TreeBuilderStep } tag @ <form> => { - // FIXME: <template> - if self.form_elem.is_some() { + if self.form_elem.is_some() && + !self.in_html_elem_named(atom!("template")) { self.sink.parse_error(Borrowed("nested forms")); } else { self.close_p_element_in_button_scope(); let elem = self.insert_element_for(tag); - // FIXME: <template> - self.form_elem = Some(elem); + if !self.in_html_elem_named(atom!("template")) { + self.form_elem = Some(elem); + } } Done } tag @ <li> <dd> <dt> => { - declare_tag_set!(close_list = li); - declare_tag_set!(close_defn = dd dt); - declare_tag_set!(extra_special = special_tag - address div p); + declare_tag_set!(close_list = "li"); + declare_tag_set!(close_defn = "dd" "dt"); + declare_tag_set!(extra_special = [special_tag] - "address" "div" "p"); let can_close: fn(::string_cache::QualName) -> bool = match tag.name { - atom!(li) => close_list, - atom!(dd) | atom!(dt) => close_defn, + atom!("li") => close_list, + atom!("dd") | atom!("dt") => close_defn, _ => unreachable!(), }; @@ -426,10 +434,10 @@ impl<Handle, Sink> TreeBuilderStep } tag @ <button> => { - if self.in_scope_named(default_scope, atom!(button)) { + if self.in_scope_named(default_scope, atom!("button")) { self.sink.parse_error(Borrowed("nested buttons")); self.generate_implied_end(cursory_implied_end); - self.pop_until_named(atom!(button)); + self.pop_until_named(atom!("button")); } self.reconstruct_formatting(); self.insert_element_for(tag); @@ -451,33 +459,43 @@ impl<Handle, Sink> TreeBuilderStep } </form> => { - // FIXME: <template> - // Can't use unwrap_or_return!() due to rust-lang/rust#16617. 
- let node = match self.form_elem.take() { - None => { - self.sink.parse_error(Borrowed("Null form element pointer on </form>")); + if !self.in_html_elem_named(atom!("template")) { + // Can't use unwrap_or_return!() due to rust-lang/rust#16617. + let node = match self.form_elem.take() { + None => { + self.sink.parse_error(Borrowed("Null form element pointer on </form>")); + return Done; + } + Some(x) => x, + }; + if !self.in_scope(default_scope, |n| self.sink.same_node(node.clone(), n)) { + self.sink.parse_error(Borrowed("Form element not in scope on </form>")); return Done; } - Some(x) => x, - }; - if !self.in_scope(default_scope, - |n| self.sink.same_node(node.clone(), n)) { - self.sink.parse_error(Borrowed("Form element not in scope on </form>")); - return Done; - } - self.generate_implied_end(cursory_implied_end); - let current = self.current_node(); - self.remove_from_stack(&node); - if !self.sink.same_node(current, node) { - self.sink.parse_error(Borrowed("Bad open element on </form>")); + self.generate_implied_end(cursory_implied_end); + let current = self.current_node(); + self.remove_from_stack(&node); + if !self.sink.same_node(current, node) { + self.sink.parse_error(Borrowed("Bad open element on </form>")); + } + } else { + if !self.in_scope_named(default_scope, atom!("form")) { + self.sink.parse_error(Borrowed("Form element not in scope on </form>")); + return Done; + } + self.generate_implied_end(cursory_implied_end); + if !self.current_node_named(atom!("form")) { + self.sink.parse_error(Borrowed("Bad open element on </form>")); + } + self.pop_until_named(atom!("form")); } Done } </p> => { - if !self.in_scope_named(button_scope, atom!(p)) { + if !self.in_scope_named(button_scope, atom!("p")) { self.sink.parse_error(Borrowed("No <p> tag to close")); - self.insert_phantom(atom!(p)); + self.insert_phantom(atom!("p")); } self.close_p_element(); Done @@ -485,7 +503,7 @@ impl<Handle, Sink> TreeBuilderStep tag @ </li> </dd> </dt> => { let scope: fn(::string_cache::QualName) -> bool = match tag.name { - atom!(li) => list_item_scope, + atom!("li") => list_item_scope, _ => default_scope, }; if self.in_scope_named(|x| scope(x), tag.name.clone()) { @@ -525,9 +543,9 @@ impl<Handle, Sink> TreeBuilderStep tag @ <nobr> => { self.reconstruct_formatting(); - if self.in_scope_named(default_scope, atom!(nobr)) { + if self.in_scope_named(default_scope, atom!("nobr")) { self.sink.parse_error(Borrowed("Nested <nobr>")); - self.adoption_agency(atom!(nobr)); + self.adoption_agency(atom!("nobr")); self.reconstruct_formatting(); } self.create_formatting_element_for(tag); @@ -580,7 +598,7 @@ impl<Handle, Sink> TreeBuilderStep tag @ <area> <br> <embed> <img> <keygen> <wbr> <input> => { let keep_frameset_ok = match tag.name { - atom!(input) => self.is_type_hidden(&tag), + atom!("input") => self.is_type_hidden(&tag), _ => false, }; self.reconstruct_formatting(); @@ -606,12 +624,53 @@ impl<Handle, Sink> TreeBuilderStep tag @ <image> => { self.unexpected(&tag); self.step(InBody, TagToken(Tag { - name: atom!(img), + name: atom!("img"), ..tag })) } - <isindex> => panic!("FIXME: <isindex> not implemented"), + tag @ <isindex> => { + self.unexpected(&tag); + let in_template = self.in_html_elem_named(atom!("template")); + if !in_template && self.form_elem.is_some() { + return Done; + } + self.frameset_ok = false; + self.close_p_element_in_button_scope(); + let mut form_attrs = vec![]; + let mut prompt = None; + let mut input_attrs = vec![]; + for attr in tag.attrs.into_iter() { + match attr.name { + qualname!("", 
"action") => form_attrs.push(attr), + qualname!("", "prompt") => prompt = Some(attr.value), + qualname!("", "name") => {}, + _ => input_attrs.push(attr), + } + } + input_attrs.push(Attribute { + name: qualname!("", "name"), + value: "isindex".to_tendril(), + }); + let form = self.insert_element(Push, ns!(html), atom!("form"), form_attrs); + if !in_template { + self.form_elem = Some(form.clone()); + } + self.insert_element(NoPush, ns!(html), atom!("hr"), vec![]); + self.reconstruct_formatting(); + self.insert_element(Push, ns!(html), atom!("label"), vec![]); + self.append_text(prompt.unwrap_or_else(|| { + "This is a searchable index. Enter search keywords: ".to_tendril() + })); + self.insert_element(NoPush, ns!(html), atom!("input"), input_attrs); + self.pop(); + self.insert_element(NoPush, ns!(html), atom!("hr"), vec![]); + self.pop(); + if !in_template { + self.form_elem = None; + } + DoneAckSelfClosing + } tag @ <textarea> => { self.ignore_lf = true; @@ -656,7 +715,7 @@ impl<Handle, Sink> TreeBuilderStep } tag @ <optgroup> <option> => { - if self.current_node_named(atom!(option)) { + if self.current_node_named(atom!("option")) { self.pop(); } self.reconstruct_formatting(); @@ -664,20 +723,31 @@ impl<Handle, Sink> TreeBuilderStep Done } - tag @ <rp> <rt> => { - if self.in_scope_named(default_scope, atom!(ruby)) { + tag @ <rb> <rtc> => { + if self.in_scope_named(default_scope, atom!("ruby")) { self.generate_implied_end(cursory_implied_end); } - if !self.current_node_named(atom!(ruby)) { + if !self.current_node_named(atom!("ruby")) { + self.unexpected(&tag); + } + self.insert_element_for(tag); + Done + } + + tag @ <rp> <rt> => { + if self.in_scope_named(default_scope, atom!("ruby")) { + self.generate_implied_end_except(atom!("rtc")); + } + if !self.current_node_named(atom!("rtc")) && !self.current_node_named(atom!("ruby")) { self.unexpected(&tag); } self.insert_element_for(tag); Done } - tag @ <math> => self.enter_foreign(tag, ns!(MathML)), + tag @ <math> => self.enter_foreign(tag, ns!(mathml)), - tag @ <svg> => self.enter_foreign(tag, ns!(SVG)), + tag @ <svg> => self.enter_foreign(tag, ns!(svg)), <caption> <col> <colgroup> <frame> <head> <tbody> <td> <tfoot> <th> <thead> <tr> => { @@ -686,7 +756,7 @@ impl<Handle, Sink> TreeBuilderStep } tag @ <_> => { - if self.opts.scripting_enabled && tag.name == atom!(noscript) { + if self.opts.scripting_enabled && tag.name == atom!("noscript") { self.parse_raw_data(tag, Rawtext); } else { self.reconstruct_formatting(); @@ -711,7 +781,7 @@ impl<Handle, Sink> TreeBuilderStep EOFToken => { self.unexpected(&token); - if self.current_node_named(atom!(script)) { + if self.current_node_named(atom!("script")) { let current = self.current_node(); self.sink.mark_script_already_started(current); } @@ -721,7 +791,7 @@ impl<Handle, Sink> TreeBuilderStep tag @ </_> => { let node = self.pop(); - if tag.name == atom!(script) { + if tag.name == atom!("script") { warn!("FIXME: </script> not fully implemented"); if self.sink.complete_script(node) == NextParserState::Suspend { self.next_tokenizer_state = Some(Quiescent); @@ -762,7 +832,7 @@ impl<Handle, Sink> TreeBuilderStep <col> => { self.pop_until_current(table_scope); - self.insert_phantom(atom!(colgroup)); + self.insert_phantom(atom!("colgroup")); Reprocess(InColumnGroup, token) } @@ -775,14 +845,14 @@ impl<Handle, Sink> TreeBuilderStep <td> <th> <tr> => { self.pop_until_current(table_scope); - self.insert_phantom(atom!(tbody)); + self.insert_phantom(atom!("tbody")); Reprocess(InTableBody, token) } <table> => { 
self.unexpected(&token); - if self.in_scope_named(table_scope, atom!(table)) { - self.pop_until_named(atom!(table)); + if self.in_scope_named(table_scope, atom!("table")) { + self.pop_until_named(atom!("table")); Reprocess(self.reset_insertion_mode(), token) } else { Done @@ -790,8 +860,8 @@ impl<Handle, Sink> TreeBuilderStep } </table> => { - if self.in_scope_named(table_scope, atom!(table)) { - self.pop_until_named(atom!(table)); + if self.in_scope_named(table_scope, atom!("table")) { + self.pop_until_named(atom!("table")); self.mode = self.reset_insertion_mode(); } else { self.unexpected(&token); @@ -818,8 +888,7 @@ impl<Handle, Sink> TreeBuilderStep tag @ <form> => { self.unexpected(&tag); - // FIXME: <template> - if self.form_elem.is_none() { + if !self.in_html_elem_named(atom!("template")) && self.form_elem.is_none() { self.form_elem = Some(self.insert_and_pop_element_for(tag)); } Done @@ -874,12 +943,12 @@ impl<Handle, Sink> TreeBuilderStep InCaption => match_token!(token { tag @ <caption> <col> <colgroup> <tbody> <td> <tfoot> <th> <thead> <tr> </table> </caption> => { - if self.in_scope_named(table_scope, atom!(caption)) { + if self.in_scope_named(table_scope, atom!("caption")) { self.generate_implied_end(cursory_implied_end); - self.expect_to_close(atom!(caption)); + self.expect_to_close(atom!("caption")); self.clear_active_formatting_to_marker(); match tag { - Tag { kind: EndTag, name: atom!(caption), .. } => { + Tag { kind: EndTag, name: atom!("caption"), .. } => { self.mode = InTable; Done } @@ -911,7 +980,7 @@ impl<Handle, Sink> TreeBuilderStep } </colgroup> => { - if self.current_node_named(atom!(colgroup)) { + if self.current_node_named(atom!("colgroup")) { self.pop(); self.mode = InTable; } else { @@ -927,7 +996,7 @@ impl<Handle, Sink> TreeBuilderStep EOFToken => self.step(InBody, token), token => { - if self.current_node_named(atom!(colgroup)) { + if self.current_node_named(atom!("colgroup")) { self.pop(); Reprocess(InTable, token) } else { @@ -948,7 +1017,7 @@ impl<Handle, Sink> TreeBuilderStep <th> <td> => { self.unexpected(&token); self.pop_until_current(table_body_context); - self.insert_phantom(atom!(tr)); + self.insert_phantom(atom!("tr")); Reprocess(InRow, token) } @@ -964,7 +1033,7 @@ impl<Handle, Sink> TreeBuilderStep } <caption> <col> <colgroup> <tbody> <tfoot> <thead> </table> => { - declare_tag_set!(table_outer = table tbody tfoot); + declare_tag_set!(table_outer = "table" "tbody" "tfoot"); if self.in_scope(table_scope, |e| self.elem_in(e, table_outer)) { self.pop_until_current(table_body_context); self.pop(); @@ -991,10 +1060,10 @@ impl<Handle, Sink> TreeBuilderStep } </tr> => { - if self.in_scope_named(table_scope, atom!(tr)) { + if self.in_scope_named(table_scope, atom!("tr")) { self.pop_until_current(table_row_context); let node = self.pop(); - self.assert_named(node, atom!(tr)); + self.assert_named(node, atom!("tr")); self.mode = InTableBody; } else { self.unexpected(&token); @@ -1003,10 +1072,10 @@ impl<Handle, Sink> TreeBuilderStep } <caption> <col> <colgroup> <tbody> <tfoot> <thead> <tr> </table> => { - if self.in_scope_named(table_scope, atom!(tr)) { + if self.in_scope_named(table_scope, atom!("tr")) { self.pop_until_current(table_row_context); let node = self.pop(); - self.assert_named(node, atom!(tr)); + self.assert_named(node, atom!("tr")); Reprocess(InTableBody, token) } else { self.unexpected(&token) @@ -1015,10 +1084,10 @@ impl<Handle, Sink> TreeBuilderStep tag @ </tbody> </tfoot> </thead> => { if self.in_scope_named(table_scope, 
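// The <isindex> arm above replaces the old panic by synthesizing a small
// form subtree. Conceptually the parser acts as if it had seen the markup
// assembled below; `isindex_equivalent` is purely illustrative, and the real
// attribute plumbing (action/prompt/name extraction, formatting
// reconstruction) is elided.

fn isindex_equivalent(action: Option<&str>, prompt: Option<&str>) -> String {
    let action_attr = action.map(|a| format!(" action=\"{}\"", a))
                            .unwrap_or_default();
    // Default prompt string taken verbatim from the diff.
    let prompt = prompt
        .unwrap_or("This is a searchable index. Enter search keywords: ");
    format!("<form{}><hr><label>{}<input name=\"isindex\"></label><hr></form>",
            action_attr, prompt)
}

fn main() {
    println!("{}", isindex_equivalent(None, None));
}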
tag.name.clone()) { - if self.in_scope_named(table_scope, atom!(tr)) { + if self.in_scope_named(table_scope, atom!("tr")) { self.pop_until_current(table_row_context); let node = self.pop(); - self.assert_named(node, atom!(tr)); + self.assert_named(node, atom!("tr")); Reprocess(InTableBody, TagToken(tag)) } else { Done @@ -1081,7 +1150,7 @@ impl<Handle, Sink> TreeBuilderStep <html> => self.step(InBody, token), tag @ <option> => { - if self.current_node_named(atom!(option)) { + if self.current_node_named(atom!("option")) { self.pop(); } self.insert_element_for(tag); @@ -1089,10 +1158,10 @@ impl<Handle, Sink> TreeBuilderStep } tag @ <optgroup> => { - if self.current_node_named(atom!(option)) { + if self.current_node_named(atom!("option")) { self.pop(); } - if self.current_node_named(atom!(optgroup)) { + if self.current_node_named(atom!("optgroup")) { self.pop(); } self.insert_element_for(tag); @@ -1101,12 +1170,12 @@ impl<Handle, Sink> TreeBuilderStep </optgroup> => { if self.open_elems.len() >= 2 - && self.current_node_named(atom!(option)) + && self.current_node_named(atom!("option")) && self.html_elem_named(self.open_elems[self.open_elems.len() - 2].clone(), - atom!(optgroup)) { + atom!("optgroup")) { self.pop(); } - if self.current_node_named(atom!(optgroup)) { + if self.current_node_named(atom!("optgroup")) { self.pop(); } else { self.unexpected(&token); @@ -1115,7 +1184,7 @@ impl<Handle, Sink> TreeBuilderStep } </option> => { - if self.current_node_named(atom!(option)) { + if self.current_node_named(atom!("option")) { self.pop(); } else { self.unexpected(&token); @@ -1124,14 +1193,14 @@ impl<Handle, Sink> TreeBuilderStep } tag @ <select> </select> => { - let in_scope = self.in_scope_named(select_scope, atom!(select)); + let in_scope = self.in_scope_named(select_scope, atom!("select")); if !in_scope || tag.kind == StartTag { self.unexpected(&tag); } if in_scope { - self.pop_until_named(atom!(select)); + self.pop_until_named(atom!("select")); self.mode = self.reset_insertion_mode(); } Done @@ -1139,8 +1208,8 @@ impl<Handle, Sink> TreeBuilderStep <input> <keygen> <textarea> => { self.unexpected(&token); - if self.in_scope_named(select_scope, atom!(select)) { - self.pop_until_named(atom!(select)); + if self.in_scope_named(select_scope, atom!("select")) { + self.pop_until_named(atom!("select")); Reprocess(self.reset_insertion_mode(), token) } else { Done @@ -1158,14 +1227,14 @@ impl<Handle, Sink> TreeBuilderStep InSelectInTable => match_token!(token { <caption> <table> <tbody> <tfoot> <thead> <tr> <td> <th> => { self.unexpected(&token); - self.pop_until_named(atom!(select)); + self.pop_until_named(atom!("select")); Reprocess(self.reset_insertion_mode(), token) } tag @ </caption> </table> </tbody> </tfoot> </thead> </tr> </td> </th> => { self.unexpected(&tag); if self.in_scope_named(table_scope, tag.name.clone()) { - self.pop_until_named(atom!(select)); + self.pop_until_named(atom!("select")); Reprocess(self.reset_insertion_mode(), TagToken(tag)) } else { Done @@ -1176,18 +1245,60 @@ impl<Handle, Sink> TreeBuilderStep }), //§ parsing-main-intemplate - InTemplate => { - // NB: Implementing <template> requires not just adding - // the InTemplate rules here, but also inserting various - // extra logic at other points noted throughout the - // parser. 
- - if self.opts.ignore_missing_rules { - self.step(InBody, token) - } else { - panic!("FIXME: <template> not implemented"); + InTemplate => match_token!(token { + CharacterTokens(_, _) => self.step(InBody, token), + CommentToken(_) => self.step(InBody, token), + + <base> <basefont> <bgsound> <link> <meta> <noframes> <script> + <style> <template> <title> </template> => { + self.step(InHead, token) + } + + <caption> <colgroup> <tbody> <tfoot> <thead> => { + self.template_modes.pop(); + self.template_modes.push(InTable); + Reprocess(InTable, token) + } + + <col> => { + self.template_modes.pop(); + self.template_modes.push(InColumnGroup); + Reprocess(InColumnGroup, token) + } + + <tr> => { + self.template_modes.pop(); + self.template_modes.push(InTableBody); + Reprocess(InTableBody, token) + } + + <td> <th> => { + self.template_modes.pop(); + self.template_modes.push(InRow); + Reprocess(InRow, token) } - } + + EOFToken => { + if !self.in_html_elem_named(atom!("template")) { + self.stop_parsing() + } else { + self.unexpected(&token); + self.pop_until_named(atom!("template")); + self.clear_active_formatting_to_marker(); + self.template_modes.pop(); + self.mode = self.reset_insertion_mode(); + Reprocess(self.reset_insertion_mode(), token) + } + } + + tag @ <_> => { + self.template_modes.pop(); + self.template_modes.push(InBody); + Reprocess(InBody, TagToken(tag)) + } + + token => self.unexpected(&token), + }), //§ parsing-main-afterbody AfterBody => match_token!(token { @@ -1232,7 +1343,7 @@ impl<Handle, Sink> TreeBuilderStep self.unexpected(&token); } else { self.pop(); - if !self.is_fragment() && !self.current_node_named(atom!(frameset)) { + if !self.is_fragment() && !self.current_node_named(atom!("frameset")) { self.mode = AfterFrameset; } } @@ -1314,7 +1425,7 @@ impl<Handle, Sink> TreeBuilderStep match_token!(token { NullCharacterToken => { self.unexpected(&token); - self.append_text("\u{fffd}".to_owned()) + self.append_text("\u{fffd}".to_tendril()) } CharacterTokens(_, text) => { @@ -1330,20 +1441,17 @@ impl<Handle, Sink> TreeBuilderStep <dt> <em> <embed> <h1> <h2> <h3> <h4> <h5> <h6> <head> <hr> <i> <img> <li> <listing> <menu> <meta> <nobr> <ol> <p> <pre> <ruby> <s> <small> <span> <strong> <strike> <sub> <sup> <table> <tt> - <u> <ul> <var> - => { - self.unexpected(&tag); - if self.is_fragment() { - self.foreign_start_tag(tag) + <u> <ul> <var> => self.unexpected_start_tag_in_foreign_content(tag), + + tag @ <font> => { + let unexpected = tag.attrs.iter().any(|attr| { + matches!(attr.name, + qualname!("", "color") | qualname!("", "face") | qualname!("", "size")) + }); + if unexpected { + self.unexpected_start_tag_in_foreign_content(tag) } else { - self.pop(); - while !self.current_node_in(|n| { - n.ns == ns!(HTML) || mathml_text_integration_point(n.clone()) - || html_integration_point(n) - }) { - self.pop(); - } - ReprocessForeign(TagToken(tag)) + self.foreign_start_tag(tag) } } @@ -1361,7 +1469,7 @@ impl<Handle, Sink> TreeBuilderStep let node = self.open_elems[stack_idx].clone(); let node_name = self.sink.elem_name(node); - if !first && node_name.ns == ns!(HTML) { + if !first && node_name.ns == ns!(html) { let mode = self.mode; return self.step(mode, TagToken(tag)); } diff --git a/html5ever-2016-08-25/src/tree_builder/tag_sets.rs b/html5ever-2016-08-25/src/tree_builder/tag_sets.rs new file mode 100644 index 000000000..35a4268c0 --- /dev/null +++ b/html5ever-2016-08-25/src/tree_builder/tag_sets.rs @@ -0,0 +1,95 @@ +// Copyright 2014 The html5ever Project Developers. 
See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Various sets of HTML tag names, and macros for declaring them. + +use string_cache::QualName; + +macro_rules! declare_tag_set_impl ( ($param:ident, $b:ident, $supr:ident, $($tag:tt)+) => ( + match $param { + $( qualname!(html, $tag) => $b, )+ + p => $supr(p), + } +)); + +macro_rules! declare_tag_set_body ( + ($param:ident = [$supr:ident] - $($tag:tt)+) + => ( declare_tag_set_impl!($param, false, $supr, $($tag)+) ); + + ($param:ident = [$supr:ident] + $($tag:tt)+) + => ( declare_tag_set_impl!($param, true, $supr, $($tag)+) ); + + ($param:ident = $($tag:tt)+) + => ( declare_tag_set_impl!($param, true, empty_set, $($tag)+) ); +); + +macro_rules! declare_tag_set ( + (pub $name:ident = $($toks:tt)+) => ( + pub fn $name(p: ::string_cache::QualName) -> bool { + declare_tag_set_body!(p = $($toks)+) + } + ); + + ($name:ident = $($toks:tt)+) => ( + fn $name(p: ::string_cache::QualName) -> bool { + declare_tag_set_body!(p = $($toks)+) + } + ); +); + +#[inline(always)] pub fn empty_set(_: QualName) -> bool { false } +#[inline(always)] pub fn full_set(_: QualName) -> bool { true } + +declare_tag_set!(pub html_default_scope = + "applet" "caption" "html" "table" "td" "th" "marquee" "object" "template"); + +#[inline(always)] pub fn default_scope(name: QualName) -> bool { + html_default_scope(name.clone()) || + mathml_text_integration_point(name.clone()) || + svg_html_integration_point(name) +} + +declare_tag_set!(pub list_item_scope = [default_scope] + "ol" "ul"); +declare_tag_set!(pub button_scope = [default_scope] + "button"); +declare_tag_set!(pub table_scope = "html" "table" "template"); +declare_tag_set!(pub select_scope = [full_set] - "optgroup" "option"); + +declare_tag_set!(pub table_body_context = "tbody" "tfoot" "thead" "template" "html"); +declare_tag_set!(pub table_row_context = "tr" "template" "html"); +declare_tag_set!(pub td_th = "td" "th"); + +declare_tag_set!(pub cursory_implied_end = "dd" "dt" "li" "option" "optgroup" "p" "rb" "rp" "rt" "rtc"); + +declare_tag_set!(pub thorough_implied_end = [cursory_implied_end] + + "caption" "colgroup" "tbody" "td" "tfoot" "th" "thead" "tr"); + +declare_tag_set!(pub heading_tag = "h1" "h2" "h3" "h4" "h5" "h6"); + +declare_tag_set!(pub special_tag = + "address" "applet" "area" "article" "aside" "base" "basefont" "bgsound" "blockquote" "body" + "br" "button" "caption" "center" "col" "colgroup" "dd" "details" "dir" "div" "dl" "dt" "embed" + "fieldset" "figcaption" "figure" "footer" "form" "frame" "frameset" "h1" "h2" "h3" "h4" "h5" + "h6" "head" "header" "hgroup" "hr" "html" "iframe" "img" "input" "isindex" "li" "link" + "listing" "main" "marquee" "menu" "menuitem" "meta" "nav" "noembed" "noframes" "noscript" + "object" "ol" "p" "param" "plaintext" "pre" "script" "section" "select" "source" "style" + "summary" "table" "tbody" "td" "template" "textarea" "tfoot" "th" "thead" "title" "tr" "track" + "ul" "wbr" "xmp"); +//§ END + +pub fn mathml_text_integration_point(p: QualName) -> bool { + matches!(p, qualname!(mathml, "mi") | qualname!(mathml, "mo") | qualname!(mathml, "mn") + | qualname!(mathml, "ms") | qualname!(mathml, "mtext")) +} + +/// 
https://html.spec.whatwg.org/multipage/#html-integration-point +pub fn svg_html_integration_point(p: QualName) -> bool { + // annotation-xml is handled in another place + matches!(p, qualname!(svg, "foreignObject") | qualname!(svg, "desc") + | qualname!(svg, "title")) +} diff --git a/html5ever-2015-05-15/src/tree_builder/types.rs b/html5ever-2016-08-25/src/tree_builder/types.rs similarity index 92% rename from html5ever-2015-05-15/src/tree_builder/types.rs rename to html5ever-2016-08-25/src/tree_builder/types.rs index 5ed8ab12d..861350432 100644 --- a/html5ever-2015-05-15/src/tree_builder/types.rs +++ b/html5ever-2016-08-25/src/tree_builder/types.rs @@ -11,6 +11,8 @@ use tokenizer::Tag; +use tendril::StrTendril; + pub use self::InsertionMode::*; pub use self::SplitStatus::*; pub use self::Token::*; @@ -56,8 +58,8 @@ pub enum SplitStatus { #[derive(PartialEq, Eq, Clone, Debug)] pub enum Token { TagToken(Tag), - CommentToken(String), - CharacterTokens(SplitStatus, String), + CommentToken(StrTendril), + CharacterTokens(SplitStatus, StrTendril), NullCharacterToken, EOFToken, } @@ -65,7 +67,7 @@ pub enum Token { pub enum ProcessResult { Done, DoneAckSelfClosing, - SplitWhitespace(String), + SplitWhitespace(StrTendril), Reprocess(InsertionMode, Token), ReprocessForeign(Token), } diff --git a/html5ever-2015-05-15/src/util/smallcharset.rs b/html5ever-2016-08-25/src/util/smallcharset.rs similarity index 84% rename from html5ever-2015-05-15/src/util/smallcharset.rs rename to html5ever-2016-08-25/src/util/smallcharset.rs index 673fd2b37..11369a1b7 100644 --- a/html5ever-2015-05-15/src/util/smallcharset.rs +++ b/html5ever-2016-08-25/src/util/smallcharset.rs @@ -22,7 +22,7 @@ impl SmallCharSet { /// Count the number of bytes of characters at the beginning /// of `buf` which are not in the set. /// See `tokenizer::buffer_queue::pop_except_from`. - pub fn nonmember_prefix_len(&self, buf: &str) -> usize { + pub fn nonmember_prefix_len(&self, buf: &str) -> u32 { let mut n = 0; for b in buf.bytes() { if b >= 64 || !self.contains(b) { @@ -48,11 +48,11 @@ mod test { #[test] fn nonmember_prefix() { for &c in ['&', '\0'].iter() { - for x in 0 .. 48usize { - for y in 0 .. 48usize { - let mut s = repeat("x").take(x).collect::<String>(); + for x in 0 .. 48u32 { + for y in 0 .. 48u32 { + let mut s = repeat("x").take(x as usize).collect::<String>(); s.push(c); - s.push_str(&repeat("x").take(y).collect::<String>()); + s.push_str(&repeat("x").take(y as usize).collect::<String>()); let set = small_char_set!('&' '\0'); assert_eq!(x, set.nonmember_prefix_len(&s)); diff --git a/html5ever-2016-08-25/src/util/str.rs b/html5ever-2016-08-25/src/util/str.rs new file mode 100644 index 000000000..7b0adb5ba --- /dev/null +++ b/html5ever-2016-08-25/src/util/str.rs @@ -0,0 +1,58 @@ +// Copyright 2014 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms.
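(Stepping back to tag_sets.rs above: a rough picture of what one declare_tag_set! invocation expands to, following the macro rules as written. This is an illustrative hand-expansion, not generated output.)

// declare_tag_set!(pub button_scope = [default_scope] + "button");
// expands to approximately:
pub fn button_scope(p: ::string_cache::QualName) -> bool {
    match p {
        qualname!(html, "button") => true,  // each listed tag maps to `true` (or `false` with the `-` form)
        p => default_scope(p),              // anything else defers to the named superset
    }
}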
+ +use std::fmt; + +pub fn to_escaped_string<T: fmt::Debug>(x: &T) -> String { + use std::fmt::Write; + + // FIXME: don't allocate twice + let mut buf = String::new(); + let _ = buf.write_fmt(format_args!("{:?}", x)); + buf.shrink_to_fit(); + buf.chars().flat_map(|c| c.escape_default()).collect() +} + +/// If `c` is an ASCII letter, return the corresponding lowercase +/// letter, otherwise None. +pub fn lower_ascii_letter(c: char) -> Option<char> { + match c { + 'a' ... 'z' => Some(c), + 'A' ... 'Z' => Some((c as u8 - b'A' + b'a') as char), + _ => None + } +} + +/// Is the character an ASCII alphanumeric character? +pub fn is_ascii_alnum(c: char) -> bool { + matches!(c, '0'...'9' | 'a'...'z' | 'A'...'Z') +} + +/// ASCII whitespace characters, as defined by +/// tree construction modes that treat them specially. +pub fn is_ascii_whitespace(c: char) -> bool { + matches!(c, '\t' | '\r' | '\n' | '\x0C' | ' ') +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod test { + use super::{is_ascii_alnum, lower_ascii_letter}; + + test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a')); + test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a')); + test_eq!(lower_letter_symbol_is_None, lower_ascii_letter('!'), None); + test_eq!(lower_letter_nonascii_is_None, lower_ascii_letter('\u{a66e}'), None); + + test_eq!(is_alnum_a, is_ascii_alnum('a'), true); + test_eq!(is_alnum_A, is_ascii_alnum('A'), true); + test_eq!(is_alnum_1, is_ascii_alnum('1'), true); + test_eq!(is_not_alnum_symbol, is_ascii_alnum('!'), false); + test_eq!(is_not_alnum_nonascii, is_ascii_alnum('\u{a66e}'), false); +} diff --git a/html5ever-2015-05-15/test_util/src/lib.rs b/html5ever-2016-08-25/tests/foreach_html5lib_test/mod.rs similarity index 100% rename from html5ever-2015-05-15/test_util/src/lib.rs rename to html5ever-2016-08-25/tests/foreach_html5lib_test/mod.rs diff --git a/html5ever-2016-08-25/tests/serializer.rs b/html5ever-2016-08-25/tests/serializer.rs new file mode 100644 index 000000000..4df6b45e8 --- /dev/null +++ b/html5ever-2016-08-25/tests/serializer.rs @@ -0,0 +1,109 @@ +// Copyright 2015 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#[macro_use] extern crate string_cache; +extern crate tendril; +extern crate html5ever; + +use std::default::Default; + +use tendril::{StrTendril, SliceExt, TendrilSink}; + +use html5ever::driver::ParseOpts; +use html5ever::{parse_fragment, parse_document, serialize}; +use html5ever::rcdom::RcDom; + +fn parse_and_serialize(input: StrTendril) -> StrTendril { + let dom = parse_fragment( + RcDom::default(), ParseOpts::default(), qualname!(html, "body"), vec![] + ).one(input); + let inner = &dom.document.borrow().children[0]; + + let mut result = vec![]; + serialize(&mut result, inner, Default::default()).unwrap(); + StrTendril::try_from_byte_slice(&result).unwrap() +} + +macro_rules! 
test { + ($name:ident, $input:expr, $output:expr) => { + #[test] + fn $name() { + assert_eq!($output, &*parse_and_serialize($input.to_tendril())); + } + }; + + // Shorthand for $output = $input + ($name:ident, $input:expr) => { + test!($name, $input, $input); + }; +} + +test!(empty, r#""#); +test!(smoke_test, r#"<p><i>Hello</i>, World!</p>"#); + +test!(misnest, r#"<p><i>Hello!</p>, World!</i>"#, + r#"<p><i>Hello!</i></p><i>, World!</i>"#); + +test!(attr_literal, r#"<base foo="<'>">"#); +test!(attr_escape_amp, r#"<base foo="&amp;">"#); +test!(attr_escape_amp_2, r#"<base foo=&amp>"#, r#"<base foo="&amp;">"#); +test!(attr_escape_nbsp, "<base foo=x\u{a0}y>", r#"<base foo="x&nbsp;y">"#); +test!(attr_escape_quot, r#"<base foo='"'>"#, r#"<base foo="&quot;">"#); +test!(attr_escape_several, r#"<span foo=3 title='test "with" &amp;quot;'>"#, + r#"<span foo="3" title="test &quot;with&quot; &amp;quot;"></span>"#); + +test!(text_literal, r#"<p>"'"</p>"#); +test!(text_escape_amp, r#"<p>&amp;</p>"#); +test!(text_escape_amp_2, r#"<p>&amp</p>"#, r#"<p>&amp;</p>"#); +test!(text_escape_nbsp, "<p>x\u{a0}y</p>", r#"<p>x&nbsp;y</p>"#); +test!(text_escape_lt, r#"<p>&lt;</p>"#); +test!(text_escape_gt, r#"<p>&gt;</p>"#); +test!(text_escape_gt2, r#"<p>></p>"#, r#"<p>&gt;</p>"#); + +test!(script_literal, r#"<script>(x & 1) < 2; y > "foo" + 'bar'</script>"#); +test!(style_literal, r#"<style>(x & 1) < 2; y > "foo" + 'bar'</style>"#); +test!(xmp_literal, r#"<xmp>(x & 1) < 2; y > "foo" + 'bar'</xmp>"#); +test!(iframe_literal, r#"<iframe>(x & 1) < 2; y > "foo" + 'bar'</iframe>"#); +test!(noembed_literal, r#"<noembed>(x & 1) < 2; y > "foo" + 'bar'</noembed>"#); +test!(noframes_literal, r#"<noframes>(x & 1) < 2; y > "foo" + 'bar'</noframes>"#);
+ +test!(pre_lf_0, "<pre>foo bar</pre>"); +test!(pre_lf_1, "<pre>\nfoo bar</pre>", "<pre>foo bar</pre>"); +test!(pre_lf_2, "<pre>\n\nfoo bar</pre>"); + +test!(textarea_lf_0, "<textarea>foo bar</textarea>"); +test!(textarea_lf_1, "<textarea>\nfoo bar</textarea>", "<textarea>foo bar</textarea>"); +test!(textarea_lf_2, "<textarea>\n\nfoo bar</textarea>"); + +test!(listing_lf_0, "<listing>foo bar</listing>"); +test!(listing_lf_1, "<listing>\nfoo bar</listing>", "<listing>foo bar</listing>"); +test!(listing_lf_2, "<listing>\n\nfoo bar</listing>"); + +test!(comment_1, r#"<p>hi <!--world--></p>"#); +test!(comment_2, r#"<p>hi <!-- world--></p>"#); +test!(comment_3, r#"<p>hi <!--world --></p>"#); +test!(comment_4, r#"<p>hi <!-- world --></p>"#);
+ +// FIXME: test serialization of qualified tag/attribute names that can't be +// parsed from HTML + +test!(attr_ns_1, r#"<svg xmlns="bleh"></svg>"#); +test!(attr_ns_2, r#"<svg xmlns:foo="bleh"></svg>"#); +test!(attr_ns_3, r#"<svg xmlns:xlink="bleh"></svg>"#); +test!(attr_ns_4, r#"<svg xlink:href="bleh"></svg>"#); + +#[test] +fn doctype() { + let dom = parse_document( + RcDom::default(), ParseOpts::default()).one("<!doctype html>"); + dom.document.borrow_mut().children.truncate(1); // Remove <html> + let mut result = vec![]; + serialize(&mut result, &dom.document, Default::default()).unwrap(); + assert_eq!(String::from_utf8(result).unwrap(), "<!DOCTYPE html>"); +}
diff --git a/html5ever-2016-08-25/tests/tokenizer.rs b/html5ever-2016-08-25/tests/tokenizer.rs new file mode 100644 index 000000000..2faebfe28 --- /dev/null +++ b/html5ever-2016-08-25/tests/tokenizer.rs @@ -0,0 +1,421 @@ +// Copyright 2014 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate rustc_serialize; +#[macro_use] extern crate string_cache; +extern crate tendril; +extern crate test; + +extern crate html5ever; + +mod foreach_html5lib_test; +use foreach_html5lib_test::foreach_html5lib_test; + +use std::{char, env}; +use std::ffi::OsStr; +use std::mem::replace; +use std::default::Default; +use std::path::Path; +use test::{TestDesc, TestDescAndFn, DynTestName, DynTestFn}; +use test::ShouldPanic::No; +use rustc_serialize::json::Json; +use std::collections::BTreeMap; +use std::borrow::Cow::Borrowed; + +use html5ever::tokenizer::{Doctype, Attribute, StartTag, EndTag, Tag}; +use html5ever::tokenizer::{Token, DoctypeToken, TagToken, CommentToken}; +use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, EOFToken, ParseError}; +use html5ever::tokenizer::{TokenSink, Tokenizer, TokenizerOpts}; +use html5ever::tokenizer::states::{Plaintext, RawData, Rcdata, Rawtext}; + +use string_cache::{Atom, QualName}; +use tendril::{StrTendril, SliceExt}; + +// Return all ways of splitting the string into at most n +// possibly-empty pieces. +fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> { + if n == 1 { + return vec!(vec!(s.to_tendril())); + } + + let mut points: Vec<usize> = s.char_indices().map(|(n,_)| n).collect(); + points.push(s.len()); + + // do this with iterators?
+ let mut out = vec!(); + for p in points.into_iter() { + let y = &s[p..]; + for mut x in splits(&s[..p], n-1).into_iter() { + x.push(y.to_tendril()); + out.push(x); + } + } + + out.extend(splits(s, n-1).into_iter()); + out +} + +struct TokenLogger { + tokens: Vec<Token>, + current_str: StrTendril, + exact_errors: bool, +} + +impl TokenLogger { + fn new(exact_errors: bool) -> TokenLogger { + TokenLogger { + tokens: vec!(), + current_str: StrTendril::new(), + exact_errors: exact_errors, + } + } + + // Push anything other than character tokens + fn push(&mut self, token: Token) { + self.finish_str(); + self.tokens.push(token); + } + + fn finish_str(&mut self) { + if self.current_str.len() > 0 { + let s = replace(&mut self.current_str, StrTendril::new()); + self.tokens.push(CharacterTokens(s)); + } + } + + fn get_tokens(mut self) -> Vec<Token> { + self.finish_str(); + self.tokens + } +} + +impl TokenSink for TokenLogger { + fn process_token(&mut self, token: Token) { + match token { + CharacterTokens(b) => { + self.current_str.push_slice(&b); + } + + NullCharacterToken => { + self.current_str.push_char('\0'); + } + + ParseError(_) => if self.exact_errors { + self.push(ParseError(Borrowed(""))); + }, + + TagToken(mut t) => { + // The spec seems to indicate that one can emit + // erroneous end tags with attrs, but the test + // cases don't contain them. + match t.kind { + EndTag => { + t.self_closing = false; + t.attrs = vec!(); + } + _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), + } + self.push(TagToken(t)); + } + + EOFToken => (), + + _ => self.push(token), + } + } +} + +fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token> { + let sink = TokenLogger::new(opts.exact_errors); + let mut tok = Tokenizer::new(sink, opts); + for chunk in input.into_iter() { + tok.feed(chunk); + } + tok.end(); + tok.unwrap().get_tokens() +} + +trait JsonExt: Sized { + fn get_str(&self) -> String; + fn get_tendril(&self) -> StrTendril; + fn get_nullable_tendril(&self) -> Option<StrTendril>; + fn get_bool(&self) -> bool; + fn get_obj<'t>(&'t self) -> &'t BTreeMap<String, Json>; + fn get_list<'t>(&'t self) -> &'t Vec<Json>; + fn find<'t>(&'t self, key: &str) -> &'t Self; +} + +impl JsonExt for Json { + fn get_str(&self) -> String { + match *self { + Json::String(ref s) => s.to_string(), + _ => panic!("Json::get_str: not a String"), + } + } + + fn get_tendril(&self) -> StrTendril { + match *self { + Json::String(ref s) => s.to_tendril(), + _ => panic!("Json::get_tendril: not a String"), + } + } + + fn get_nullable_tendril(&self) -> Option<StrTendril> { + match *self { + Json::Null => None, + Json::String(ref s) => Some(s.to_tendril()), + _ => panic!("Json::get_nullable_tendril: not a String"), + } + } + + fn get_bool(&self) -> bool { + match *self { + Json::Boolean(b) => b, + _ => panic!("Json::get_bool: not a Boolean"), + } + } + + fn get_obj<'t>(&'t self) -> &'t BTreeMap<String, Json> { + match *self { + Json::Object(ref m) => &*m, + _ => panic!("Json::get_obj: not an Object"), + } + } + + fn get_list<'t>(&'t self) -> &'t Vec<Json> { + match *self { + Json::Array(ref m) => m, + _ => panic!("Json::get_list: not an Array"), + } + } + + fn find<'t>(&'t self, key: &str) -> &'t Json { + self.get_obj().get(&key.to_string()).unwrap() + } +} + +// Parse a JSON object (other than "ParseError") to a token. +fn json_to_token(js: &Json) -> Token { + let parts = js.get_list(); + // Collect refs here so we don't have to use "ref" in all the patterns below.
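    // (For orientation — the html5lib token arrays being decoded here look like
    // ["StartTag", "a", {"href": "x"}], ["EndTag", "a"], ["Comment", "hi"],
    // ["Character", "text"] or ["DOCTYPE", "html", null, null, true]; the
    // DOCTYPE's final boolean is "correctness", hence the negation below.)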
+ let args: Vec<&Json> = parts[1..].iter().collect(); + match &*parts[0].get_str() { + "DOCTYPE" => DoctypeToken(Doctype { + name: args[0].get_nullable_tendril(), + public_id: args[1].get_nullable_tendril(), + system_id: args[2].get_nullable_tendril(), + force_quirks: !args[3].get_bool(), + }), + + "StartTag" => TagToken(Tag { + kind: StartTag, + name: Atom::from(&*args[0].get_str()), + attrs: args[1].get_obj().iter().map(|(k,v)| { + Attribute { + name: QualName::new(ns!(), Atom::from(&**k)), + value: v.get_tendril() + } + }).collect(), + self_closing: match args.get(2) { + Some(b) => b.get_bool(), + None => false, + } + }), + + "EndTag" => TagToken(Tag { + kind: EndTag, + name: Atom::from(&*args[0].get_str()), + attrs: vec!(), + self_closing: false + }), + + "Comment" => CommentToken(args[0].get_tendril()), + + "Character" => CharacterTokens(args[0].get_tendril()), + + // We don't need to produce NullCharacterToken because + // the TokenLogger will convert them to CharacterTokens. + + _ => panic!("don't understand token {:?}", parts), + } +} + +// Parse the "output" field of the test case into a vector of tokens. +fn json_to_tokens(js: &Json, exact_errors: bool) -> Vec<Token> { + // Use a TokenLogger so that we combine character tokens separated + // by an ignored error. + let mut sink = TokenLogger::new(exact_errors); + for tok in js.get_list().iter() { + match *tok { + Json::String(ref s) + if &s[..] == "ParseError" => sink.process_token(ParseError(Borrowed(""))), + _ => sink.process_token(json_to_token(tok)), + } + } + sink.get_tokens() +} + +// Undo the escaping in "doubleEscaped" tests. +fn unescape(s: &str) -> Option<String> { + let mut out = String::with_capacity(s.len()); + let mut it = s.chars().peekable(); + loop { + match it.next() { + None => return Some(out), + Some('\\') => { + if it.peek() != Some(&'u') { + panic!("can't understand escape"); + } + drop(it.next()); + let hex: String = it.by_ref().take(4).collect(); + match u32::from_str_radix(&hex, 16).ok() + .and_then(char::from_u32) { + // Some of the tests use lone surrogates, but we have no + // way to represent them in the UTF-8 input to our parser. + // Since these can only come from script, we will catch + // them there. + None => return None, + Some(c) => out.push(c), + } + } + Some(c) => out.push(c), + } + } +} + +fn unescape_json(js: &Json) -> Json { + match *js { + // unwrap is OK here because the spec'd *output* of the tokenizer never + // contains a lone surrogate. + Json::String(ref s) => Json::String(unescape(&s).unwrap()), + Json::Array(ref xs) => Json::Array(xs.iter().map(unescape_json).collect()), + Json::Object(ref obj) => { + let mut new_obj = BTreeMap::new(); + for (k,v) in obj.iter() { + new_obj.insert(k.clone(), unescape_json(v)); + } + Json::Object(new_obj) + } + _ => js.clone(), + } +} + +fn mk_test(desc: String, input: String, expect: Vec<Token>, opts: TokenizerOpts) + -> TestDescAndFn { + TestDescAndFn { + desc: TestDesc { + name: DynTestName(desc), + ignore: false, + should_panic: No, + }, + testfn: DynTestFn(Box::new(move || { + // Split up the input at different points to test incremental tokenization. + let insplits = splits(&input, 3); + for input in insplits.into_iter() { + // Clone 'input' so we have it for the failure message. + // Also clone opts. If we don't, we get the wrong + // result but the compiler doesn't catch it! + // Possibly mozilla/rust#12223.
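                // (The property being exercised: tokenization is chunk-transparent,
                // so feeding e.g. "<a hr" and then "ef=x>" must yield exactly the
                // tokens of the unsplit input; splits(&input, 3) above enumerates
                // every such 2- and 3-way cut of the test case.)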
+ let output = tokenize(input.clone(), opts.clone()); + if output != expect { + panic!("\ninput: {:?}\ngot: {:?}\nexpected: {:?}", + input, output, expect); + } + } + })), + } +} + +fn mk_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Json) { + let obj = js.get_obj(); + let mut input = js.find("input").unwrap().get_str(); + let mut expect = js.find("output").unwrap().clone(); + let desc = format!("tok: {}: {}", + filename, js.find("description").unwrap().get_str()); + + // "Double-escaped" tests require additional processing of + // the input and output. + if obj.get(&"doubleEscaped".to_string()).map_or(false, |j| j.get_bool()) { + match unescape(&input) { + None => return, + Some(i) => input = i, + } + expect = unescape_json(&expect); + } + + // Some tests have a last start tag name. + let start_tag = obj.get(&"lastStartTag".to_string()).map(|s| s.get_str()); + + // Some tests want to start in a state other than Data. + let state_overrides = match obj.get(&"initialStates".to_string()) { + Some(&Json::Array(ref xs)) => xs.iter().map(|s| + Some(match &s.get_str()[..] { + "PLAINTEXT state" => Plaintext, + "RAWTEXT state" => RawData(Rawtext), + "RCDATA state" => RawData(Rcdata), + s => panic!("don't know state {}", s), + })).collect(), + None => vec!(None), + _ => panic!("don't understand initialStates value"), + }; + + // Build the tests. + for state in state_overrides.into_iter() { + for &exact_errors in [false, true].iter() { + let mut newdesc = desc.clone(); + match state { + Some(s) => newdesc = format!("{} (in state {:?})", newdesc, s), + None => (), + }; + if exact_errors { + newdesc = format!("{} (exact errors)", newdesc); + } + + let expect_toks = json_to_tokens(&expect, exact_errors); + tests.push(mk_test(newdesc, input.clone(), expect_toks, TokenizerOpts { + exact_errors: exact_errors, + initial_state: state, + last_start_tag_name: start_tag.clone(), + + // Not discarding a BOM is what the test suite expects; see + // https://github.com/html5lib/html5lib-tests/issues/2 + discard_bom: false, + + .. Default::default() + })); + } + } +} + +fn tests(src_dir: &Path) -> Vec<TestDescAndFn> { + let mut tests = vec!(); + + foreach_html5lib_test(src_dir, "tokenizer", + OsStr::new("test"), |path, mut file| { + let js = Json::from_reader(&mut file).ok().expect("json parse error"); + + match js.get_obj().get(&"tests".to_string()) { + Some(&Json::Array(ref lst)) => { + for test in lst.iter() { + mk_tests(&mut tests, path.file_name().unwrap().to_str().unwrap(), test); + } + } + + // xmlViolation.test doesn't follow this format. + _ => (), + } + }); + + tests +} + +fn main() { + let args: Vec<_> = env::args().collect(); + test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR")))); +} diff --git a/html5ever-2016-08-25/tests/tree_builder.rs b/html5ever-2016-08-25/tests/tree_builder.rs new file mode 100644 index 000000000..527512c0b --- /dev/null +++ b/html5ever-2016-08-25/tests/tree_builder.rs @@ -0,0 +1,284 @@ +// Copyright 2014 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms.
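(The harness below consumes html5lib tree-construction files. For context, a typical .dat case has roughly this shape — an illustrative case, not quoted from any particular file; parse_tests splits it into the #-keyed fields, and the comparison uses the "document" field's "|"-indented serialization:)

#data
<p>One<p>Two
#errors
(expected parse errors; not compared by this harness)
#document
| <html>
|   <head>
|   <body>
|     <p>
|       "One"
|     <p>
|       "Two"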
+ +extern crate test; +#[macro_use] extern crate string_cache; +extern crate tendril; + +extern crate html5ever; + +mod foreach_html5lib_test; +use foreach_html5lib_test::foreach_html5lib_test; + +use std::{fs, io, env}; +use std::io::BufRead; +use std::ffi::OsStr; +use std::iter::repeat; +use std::mem::replace; +use std::default::Default; +use std::path::Path; +use std::collections::{HashSet, HashMap}; +use test::{TestDesc, TestDescAndFn, DynTestName, TestFn}; +use test::ShouldPanic::No; + +use html5ever::{ParseOpts, parse_document, parse_fragment}; +use html5ever::rcdom::{Comment, Document, Doctype, Element, Handle, RcDom}; +use html5ever::rcdom::{Template, Text}; + +use string_cache::{Atom, QualName}; +use tendril::{StrTendril, TendrilSink}; + +fn parse_tests<It: Iterator<Item=String>>(mut lines: It) -> Vec<HashMap<String, String>> { + let mut tests = vec!(); + let mut test = HashMap::new(); + let mut key: Option<String> = None; + let mut val = String::new(); + + macro_rules! finish_val ( () => ( + match key.take() { + None => (), + Some(key) => { + assert!(test.insert(key, replace(&mut val, String::new())).is_none()); + } + } + )); + + macro_rules! finish_test ( () => ( + if !test.is_empty() { + tests.push(replace(&mut test, HashMap::new())); + } + )); + + loop { + match lines.next() { + None => break, + Some(line) => { + if line.starts_with("#") { + finish_val!(); + if line == "#data" { + finish_test!(); + } + key = Some(line[1..].to_string()); + } else { + val.push_str(&line); + val.push('\n'); + } + } + } + } + + finish_val!(); + finish_test!(); + tests +} + +fn serialize(buf: &mut String, indent: usize, handle: Handle) { + buf.push_str("|"); + buf.push_str(&repeat(" ").take(indent).collect::<String>()); + + let node = handle.borrow(); + match node.node { + Document => panic!("should not reach Document"), + + Doctype(ref name, ref public, ref system) => { + buf.push_str("<!DOCTYPE "); + buf.push_str(&name); + if !public.is_empty() || !system.is_empty() { + buf.push_str(&format!(" \"{}\" \"{}\"", public, system)); + } + buf.push_str(">\n"); + } + + Text(ref text) => { + buf.push_str("\""); + buf.push_str(&text); + buf.push_str("\"\n"); + } + + Comment(ref text) => { + buf.push_str("<!-- "); + buf.push_str(&text); + buf.push_str(" -->\n"); + } + + Element(ref name, _, ref attrs) => { + buf.push_str("<"); + match name.ns { + ns!(svg) => buf.push_str("svg "), + ns!(mathml) => buf.push_str("math "), + _ => (), + } + buf.push_str(&*name.local); + buf.push_str(">\n"); + + let mut attrs = attrs.clone(); + attrs.sort_by(|x, y| x.name.local.cmp(&y.name.local)); + // FIXME: sort by UTF-16 code unit + + for attr in attrs.into_iter() { + buf.push_str("|"); + buf.push_str(&repeat(" ").take(indent+2).collect::<String>()); + match attr.name.ns { + ns!(xlink) => buf.push_str("xlink "), + ns!(xml) => buf.push_str("xml "), + ns!(xmlns) => buf.push_str("xmlns "), + _ => (), + } + buf.push_str(&format!("{}=\"{}\"\n", + attr.name.local, attr.value)); + } + } + } + + for child in node.children.iter() { + serialize(buf, indent+2, child.clone()); + } + + if let Element(_, Template(ref content), _) = node.node { + buf.push_str("|"); + buf.push_str(&repeat(" ").take(indent+2).collect::<String>()); + buf.push_str("content\n"); + for child in &content.borrow().children { + serialize(buf, indent+4, child.clone()); + } + } +} + +fn make_test( + tests: &mut Vec<TestDescAndFn>, + ignores: &HashSet<String>, + filename: &str, + idx: usize, + fields: HashMap<String, String>) { + + let scripting_flags = &[false, true]; + let scripting_flags = if fields.contains_key("script-off") { + &scripting_flags[0..1] + } else if fields.contains_key("script-on") { + &scripting_flags[1..2] + } else { + &scripting_flags[0..2] + }; + let name = format!("tb: {}-{}", filename, idx); + for scripting_enabled in scripting_flags { + let test =
make_test_desc_with_scripting_flag( + ignores, &name, &fields, *scripting_enabled); + tests.push(test); + } +} + +fn make_test_desc_with_scripting_flag( + ignores: &HashSet<String>, + name: &str, + fields: &HashMap<String, String>, + scripting_enabled: bool) + -> TestDescAndFn { + let get_field = |key| { + let field = fields.get(key).expect("missing field"); + field.trim_right_matches('\n').to_string() + }; + + let mut data = fields.get("data").expect("missing data").to_string(); + data.pop(); + let expected = get_field("document"); + let context = fields.get("document-fragment") + .map(|field| context_name(field.trim_right_matches('\n'))); + let ignore = ignores.contains(name); + let mut name = name.to_owned(); + if scripting_enabled { + name.push_str(" (scripting enabled)"); + } else { + name.push_str(" (scripting disabled)"); + }; + let mut opts: ParseOpts = Default::default(); + opts.tree_builder.scripting_enabled = scripting_enabled; + + TestDescAndFn { + desc: TestDesc { + name: DynTestName(name), + ignore: ignore, + should_panic: No, + }, + testfn: TestFn::dyn_test_fn(move || { + // Do this here because Tendril isn't Send. + let data = StrTendril::from_slice(&data); + let mut result = String::new(); + match context { + None => { + let dom = parse_document(RcDom::default(), opts).one(data.clone()); + for child in dom.document.borrow().children.iter() { + serialize(&mut result, 1, child.clone()); + } + }, + Some(ref context) => { + let dom = parse_fragment(RcDom::default(), opts, context.clone(), vec![]) + .one(data.clone()); + // fragment case: serialize children of the html element + // rather than children of the document + let doc = dom.document.borrow(); + let root = doc.children[0].borrow(); + for child in root.children.iter() { + serialize(&mut result, 1, child.clone()); + } + }, + }; + let len = result.len(); + result.truncate(len - 1); // drop the trailing newline + + if result != expected { + panic!("\ninput: {}\ngot:\n{}\nexpected:\n{}\n", + data, result, expected); + } + }), + } +} + +fn context_name(context: &str) -> QualName { + if context.starts_with("svg ") { + QualName::new(ns!(svg), Atom::from(&context[4..])) + } else if context.starts_with("math ") { + QualName::new(ns!(mathml), Atom::from(&context[5..])) + } else { + QualName::new(ns!(html), Atom::from(context)) + } +} + +fn tests(src_dir: &Path, ignores: &HashSet<String>) -> Vec<TestDescAndFn> { + let mut tests = vec!(); + + foreach_html5lib_test(src_dir, "tree-construction", + OsStr::new("dat"), |path, file| { + let buf = io::BufReader::new(file); + let lines = buf.lines() + .map(|res| res.ok().expect("couldn't read")); + let data = parse_tests(lines); + + for (i, test) in data.into_iter().enumerate() { + make_test(&mut tests, ignores, path.file_name().unwrap().to_str().unwrap(), + i, test); + } + }); + + tests +} + +fn main() { + let args: Vec<_> = env::args().collect(); + let src_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let mut ignores = HashSet::new(); + { + let f = fs::File::open(&src_dir.join("data/test/ignore")).unwrap(); + let r = io::BufReader::new(f); + for ln in r.lines() { + ignores.insert(ln.unwrap().trim_right().to_string()); + } + } + + test::test_main(&args, tests(src_dir, &ignores)); +}
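(A note on the ignore list loaded in main() above: it is matched against the base names built in make_test, one per line, so entries would look like the following hypothetical examples. Because the lookup happens before the scripting suffix is appended, one entry mutes both the scripting-enabled and scripting-disabled variants of a case.)

tb: tests18.dat-1
tb: template.dat-107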