Skip to content

Commit 80296bb

Browse files
committed
[stdlib] Optimize Set.subtracting(_:)
Use a temporary bitset to avoid hashing any element more than once, and to avoid rehashing while the result set is being built. Depending on the number of elements removed, this ranges from no measurable speedup to a speedup of about 4x.
1 parent 8612f2f commit 80296bb

File tree

4 files changed

+62
-3
lines changed

4 files changed

+62
-3
lines changed

stdlib/public/core/Bitset.swift

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,3 +369,19 @@ extension _UnsafeBitset {
369369
}
370370
}
371371
}
372+
373+
extension _UnsafeBitset {
  /// Calls `body` with a temporary bitset whose words start out as a copy of
  /// the words of `original`, and returns whatever `body` returns.
  ///
  /// The temporary bitset is obtained from
  /// `_withTemporaryUninitializedBitset(wordCount:)` with the same word count
  /// as `original`; its words are initialized from `original.words` before
  /// `body` is invoked.
  ///
  /// - Parameters:
  ///   - original: The bitset whose words are copied into the temporary one.
  ///   - body: A closure that receives the temporary copy.
  /// - Returns: The value returned by `body`.
  /// - Throws: Rethrows any error thrown by `body`.
  @_alwaysEmitIntoClient
  @inline(__always)
  internal static func withTemporaryCopy<R>(
    of original: _UnsafeBitset,
    body: (_UnsafeBitset) throws -> R
  ) rethrows -> R {
    let wordCount = original.wordCount
    return try _withTemporaryUninitializedBitset(wordCount: wordCount) { scratch in
      // The scratch words arrive uninitialized; fill them from the source
      // before handing the bitset to the caller.
      scratch.words.initialize(from: original.words, count: wordCount)
      return try body(scratch)
    }
  }
}

stdlib/public/core/HashTable.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,17 @@ internal struct _HashTable {
5252
return _UnsafeBitset.wordCount(forCapacity: bucketCount)
5353
}
5454
}
55+
56+
/// A bitset view of this table's occupied buckets, built from the table's
/// own `words` and `wordCount`.
///
/// Caveat: when the hash table's bitset consists of just one partial word,
/// the bits of that word that fall outside the table's bounds are guaranteed
/// to all be set. Those filler bits exist only to make finding holes faster;
/// they do not stand for occupied buckets.
@_alwaysEmitIntoClient
internal var bitset: _UnsafeBitset {
  get {
    return _UnsafeBitset(words: self.words, wordCount: self.wordCount)
  }
}
5566
}
5667

5768
extension _HashTable {

stdlib/public/core/NativeSet.swift

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,4 +657,38 @@ extension _NativeSet {
657657
return true
658658
}
659659
}
660+
661+
/// Returns a native set holding the elements of this set that do not occur
/// in `other`.
///
/// The difference is first computed in a temporary copy of the hash table's
/// occupancy bitset: every element of `other` that is found in this set has
/// its bucket's bit cleared. That way each element (of either set) is hashed
/// only once, and the exact size of the result is known before it is
/// allocated, so inserting into it never triggers a rehash.
///
/// - Parameter other: A sequence of elements to leave out of the result.
/// - Returns: An empty set if this set is empty or every element was
///   removed; `self` if nothing was removed; otherwise a new set with
///   exactly the surviving elements.
@_alwaysEmitIntoClient
internal __consuming func subtracting<S: Sequence>(_ other: S) -> _NativeSet
where S.Element == Element {
  guard count > 0 else { return _NativeSet() }
  let originalCount = self.count
  return _UnsafeBitset.withTemporaryCopy(of: hashTable.bitset) { survivors in
    var remaining = originalCount
    for candidate in other {
      let (bucket, found) = find(candidate)
      // Only touch the bitset for members of this set; a bit that was
      // already cleared (duplicate in `other`) must not be counted twice.
      guard found, survivors.uncheckedRemove(bucket.offset) else { continue }
      remaining -= 1
      if remaining == 0 { return _NativeSet() }
    }
    _internalInvariant(remaining > 0)
    // Nothing was removed, so the receiver itself is the answer.
    guard remaining != originalCount else { return self }

    let result = _NativeSet(capacity: remaining)
    var pending = remaining
    for offset in survivors {
      let bucket = Bucket(offset: offset)
      result._unsafeInsertNew(self.uncheckedElement(at: bucket))
      // The hash table can have set bits after the end of the bitmap, so
      // stop as soon as every surviving element has been copied instead of
      // walking the bitset to exhaustion.
      pending -= 1
      if pending == 0 { break }
    }
    return result
  }
}
660694
}

stdlib/public/core/Set.swift

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -903,9 +903,7 @@ extension Set: SetAlgebra {
903903
_ other: S
904904
) -> Set<Element>
905905
where S.Element == Element {
906-
var newSet = self
907-
newSet.subtract(other)
908-
return newSet
906+
return Set(_native: _variant.convertedToNative.subtracting(other))
909907
}
910908

911909
/// Removes the elements of the given sequence from the set.

0 commit comments

Comments
 (0)