Skip to content

Commit 26f04ed

Browse files
authored
Merge pull request #221 from rust-scraper/sorted-vec-instead-of-hash-table
RFC: Drop hash table for per-element attributes for more compact sorted vector
2 parents 8d3e74b + ee66ee8 commit 26f04ed

File tree

4 files changed

+64
-63
lines changed

4 files changed

+64
-63
lines changed

Cargo.lock

Lines changed: 26 additions & 46 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scraper/Cargo.toml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,10 @@ repository = "https://github.com/causal-agent/scraper"
1313
readme = "README.md"
1414

1515
[dependencies]
16-
ahash = "0.8.0"
1716
cssparser = "0.34.0"
1817
ego-tree = "0.9.0"
1918
html5ever = "0.29.0"
20-
indexmap = { version = "2.6.0", optional = true }
19+
indexmap = { version = "2.7.0", optional = true }
2120
precomputed-hash = "0.1.1"
2221
selectors = "0.26.0"
2322
tendril = "0.4.3"

scraper/src/html/tree_sink.rs

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -223,6 +223,17 @@ impl TreeSink for HtmlTreeSink {
223223
};
224224

225225
for attr in attrs {
226+
#[cfg(not(feature = "deterministic"))]
227+
if let Err(idx) = element
228+
.attrs
229+
.binary_search_by(|(name, _)| name.cmp(&attr.name))
230+
{
231+
element
232+
.attrs
233+
.insert(idx, (attr.name, make_tendril(attr.value)));
234+
}
235+
236+
#[cfg(feature = "deterministic")]
226237
element
227238
.attrs
228239
.entry(attr.name)

scraper/src/node.rs

Lines changed: 26 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,5 @@
11
//! HTML nodes.
22
3-
#[cfg(not(feature = "deterministic"))]
4-
use ahash::AHashMap as HashMap;
5-
#[cfg(not(feature = "deterministic"))]
6-
use std::collections::hash_map;
73
use std::fmt;
84
use std::ops::Deref;
95
use std::slice::Iter as SliceIter;
@@ -219,7 +215,7 @@ pub type Attributes = indexmap::IndexMap<QualName, StrTendril>;
219215
/// Please enable the `deterministic` feature for order-preserving
220216
/// (de)serialization.
221217
#[cfg(not(feature = "deterministic"))]
222-
pub type Attributes = HashMap<QualName, StrTendril>;
218+
pub type Attributes = Vec<(QualName, StrTendril)>;
223219

224220
/// An HTML element.
225221
#[derive(Clone, PartialEq, Eq)]
@@ -232,16 +228,20 @@ pub struct Element {
232228

233229
id: OnceCell<Option<StrTendril>>,
234230

235-
classes: OnceCell<Vec<LocalName>>,
231+
classes: OnceCell<Box<[LocalName]>>,
236232
}
237233

238234
impl Element {
239235
#[doc(hidden)]
240236
pub fn new(name: QualName, attributes: Vec<Attribute>) -> Self {
241-
let attrs = attributes
237+
#[allow(unused_mut)]
238+
let mut attrs = attributes
242239
.into_iter()
243-
.map(|a| (a.name, crate::tendril_util::make(a.value)))
244-
.collect();
240+
.map(|attr| (attr.name, crate::tendril_util::make(attr.value)))
241+
.collect::<Attributes>();
242+
243+
#[cfg(not(feature = "deterministic"))]
244+
attrs.sort_unstable_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
245245

246246
Element {
247247
attrs,
@@ -277,17 +277,17 @@ impl Element {
277277
/// Returns an iterator over the element's classes.
278278
pub fn classes(&self) -> Classes {
279279
let classes = self.classes.get_or_init(|| {
280-
let mut classes: Vec<LocalName> = self
280+
let mut classes = self
281281
.attrs
282282
.iter()
283283
.filter(|(name, _)| name.local.as_ref() == "class")
284-
.flat_map(|(_, value)| value.split_whitespace().map(LocalName::from))
285-
.collect();
284+
.flat_map(|(_, value)| value.split_ascii_whitespace().map(LocalName::from))
285+
.collect::<Vec<_>>();
286286

287287
classes.sort_unstable();
288288
classes.dedup();
289289

290-
classes
290+
classes.into_boxed_slice()
291291
});
292292

293293
Classes {
@@ -298,7 +298,18 @@ impl Element {
298298
/// Returns the value of an attribute.
299299
pub fn attr(&self, attr: &str) -> Option<&str> {
300300
let qualname = QualName::new(None, ns!(), LocalName::from(attr));
301-
self.attrs.get(&qualname).map(Deref::deref)
301+
302+
#[cfg(not(feature = "deterministic"))]
303+
let value = self
304+
.attrs
305+
.binary_search_by(|attr| attr.0.cmp(&qualname))
306+
.ok()
307+
.map(|idx| &*self.attrs[idx].1);
308+
309+
#[cfg(feature = "deterministic")]
310+
let value = self.attrs.get(&qualname).map(Deref::deref);
311+
312+
value
302313
}
303314

304315
/// Returns an iterator over the element's attributes.
@@ -330,7 +341,7 @@ pub type AttributesIter<'a> = indexmap::map::Iter<'a, QualName, StrTendril>;
330341

331342
/// An iterator over a node's attributes.
332343
#[cfg(not(feature = "deterministic"))]
333-
pub type AttributesIter<'a> = hash_map::Iter<'a, QualName, StrTendril>;
344+
pub type AttributesIter<'a> = SliceIter<'a, (QualName, StrTendril)>;
334345

335346
/// Iterator over attributes.
336347
#[allow(missing_debug_implementations)]

0 commit comments

Comments
 (0)