-
Notifications
You must be signed in to change notification settings - Fork 48
Perceptual hash #150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Perceptual hash #150
Changes from 18 commits
Commits
Show all changes
20 commits
Select commit
Hold shift + click to select a range
f17f028
initial commit
bjlittle 5eb088b
rejig kernel api
bjlittle b5aa567
templates + consistency
bjlittle d467865
add phash high frequence factor
bjlittle 7bc9e02
trap missing marker
bjlittle ec4df13
tidy + inject kernel name to summary
bjlittle 3f7d113
extend summary dict
bjlittle 35789fd
ensure summary kernel entry populated
bjlittle c2186d6
inject source kernel to hash lib + checks
bjlittle 80bc924
fix test_pytest_mpl.py
bjlittle e325708
temp disable subtests
bjlittle 6c82386
fix tests
bjlittle 55c8c07
fix test
bjlittle 7a9cd87
update subtests module skip TODO
bjlittle 431e516
add KernelPHash tests
bjlittle b69d734
add KernelSHA256 tests
bjlittle 2b48aff
test --mpl-kernel=phash
bjlittle 7c35990
working phash lib
bjlittle fa630de
review action - hash library kernel
bjlittle 5958c51
review actions
bjlittle File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
""" | ||
This module contains the supported hashing kernel implementations. | ||
|
||
""" | ||
import hashlib | ||
from abc import ABC, abstractmethod | ||
|
||
import imagehash | ||
from PIL import Image | ||
|
||
#: The default hamming distance bit tolerance for "similar" imagehash hashes. | ||
DEFAULT_HAMMING_TOLERANCE = 4 | ||
|
||
#: The default imagehash hash size (N), resulting in a hash of N**2 bits. | ||
DEFAULT_HASH_SIZE = 16 | ||
|
||
#: Level of image detail (high) or structure (low) represented by phash . | ||
DEFAULT_HIGH_FREQUENCY_FACTOR = 4 | ||
|
||
#: Registered kernel names. | ||
KERNEL_PHASH = "phash" | ||
KERNEL_SHA256 = "sha256" | ||
|
||
__all__ = [ | ||
"DEFAULT_HAMMING_TOLERANCE", | ||
"DEFAULT_HASH_SIZE", | ||
"DEFAULT_HIGH_FREQUENCY_FACTOR", | ||
"KERNEL_PHASH", | ||
"KERNEL_SHA256", | ||
"KernelPHash", | ||
"KernelSHA256", | ||
"kernel_factory", | ||
] | ||
|
||
|
||
class Kernel(ABC): | ||
""" | ||
Kernel abstract base class (ABC) which defines a simple common kernel API. | ||
|
||
""" | ||
|
||
def __init__(self, plugin): | ||
# Containment of the plugin allows the kernel to cherry-pick required state. | ||
self._plugin = plugin | ||
bjlittle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
@abstractmethod | ||
def equivalent_hash(self, result, baseline, marker=None): | ||
""" | ||
Determine whether the kernel considers the provided result (actual) | ||
and baseline (expected) hashes as similar. | ||
|
||
Parameters | ||
---------- | ||
result : str | ||
The hash of the image generated by the test. | ||
baseline : str | ||
The hash of the baseline image. | ||
marker : pytest.Mark | ||
The test marker, which may contain kwarg options to be | ||
applied to the equivalence test. | ||
|
||
Returns | ||
------- | ||
bool | ||
Whether the result and baseline hashes are deemed similar. | ||
|
||
""" | ||
|
||
@abstractmethod | ||
def generate_hash(self, buffer): | ||
""" | ||
Computes the hash of the image from the in-memory/open byte stream | ||
buffer. | ||
|
||
Parameters | ||
---------- | ||
buffer : stream | ||
The in-memory/open byte stream of the image. | ||
|
||
Returns | ||
------- | ||
str | ||
The string representation (hexdigest) of the image hash. | ||
|
||
""" | ||
|
||
def update_status(self, message): | ||
""" | ||
Append the kernel status message to the provided message. | ||
|
||
Parameters | ||
---------- | ||
message : str | ||
The existing status message. | ||
|
||
Returns | ||
------- | ||
str | ||
The updated status message. | ||
|
||
""" | ||
return message | ||
|
||
def update_summary(self, summary): | ||
""" | ||
Refresh the image comparison summary with relevant kernel entries. | ||
|
||
Parameters | ||
---------- | ||
summary : dict | ||
Image comparison test report summary. | ||
|
||
Returns | ||
------- | ||
None | ||
|
||
""" | ||
# The "name" class property *must* be defined in derived child class. | ||
summary["kernel"] = self.name | ||
|
||
|
||
class KernelPHash(Kernel): | ||
""" | ||
Kernel that calculates a perceptual hash of an image for the | ||
specified hash size (N) and high frequency factor. | ||
|
||
Where the resultant perceptual hash will be composed of N**2 bits. | ||
|
||
""" | ||
|
||
name = KERNEL_PHASH | ||
|
||
def __init__(self, plugin): | ||
super().__init__(plugin) | ||
# Keep state of the equivalence result. | ||
self.equivalent = None | ||
# Keep state of hash hamming distance (whole number) result. | ||
self.hamming_distance = None | ||
# Value may be overridden by py.test marker kwarg. | ||
arg = self._plugin.hamming_tolerance | ||
self.hamming_tolerance = arg if arg is not None else DEFAULT_HAMMING_TOLERANCE | ||
# The hash-size (N) defines the resultant N**2 bits hash size. | ||
arg = self._plugin.hash_size | ||
self.hash_size = arg if arg is not None else DEFAULT_HASH_SIZE | ||
# The level of image detail (high freq) or structure (low freq) | ||
# represented in perceptual hash thru discrete cosine transform. | ||
arg = self._plugin.high_freq_factor | ||
self.high_freq_factor = ( | ||
arg if arg is not None else DEFAULT_HIGH_FREQUENCY_FACTOR | ||
) | ||
# py.test marker kwarg. | ||
self.option = "hamming_tolerance" | ||
|
||
def equivalent_hash(self, result, baseline, marker=None): | ||
if marker: | ||
value = marker.kwargs.get(self.option) | ||
if value is not None: | ||
# Override with the decorator marker value. | ||
self.hamming_tolerance = int(value) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess this behaviour is relevant to the discussion over on #154 |
||
# Convert string hexdigest hashes to imagehash.ImageHash instances. | ||
result = imagehash.hex_to_hash(result) | ||
baseline = imagehash.hex_to_hash(baseline) | ||
# Unlike cryptographic hashes, perceptual hashes can measure the | ||
# degree of "similarity" through hamming distance bit differences | ||
# between the hashes. | ||
try: | ||
self.hamming_distance = result - baseline | ||
self.equivalent = self.hamming_distance <= self.hamming_tolerance | ||
except TypeError: | ||
# imagehash won't compare hashes of different sizes, however | ||
# let's gracefully support this for use-ability. | ||
self.hamming_distance = None | ||
self.equivalent = False | ||
return self.equivalent | ||
|
||
def generate_hash(self, buffer): | ||
buffer.seek(0) | ||
data = Image.open(buffer) | ||
phash = imagehash.phash( | ||
data, hash_size=self.hash_size, highfreq_factor=self.high_freq_factor | ||
) | ||
return str(phash) | ||
|
||
def update_status(self, message): | ||
result = str() if message is None else str(message) | ||
# Only update the status message for non-equivalent hash comparisons. | ||
if self.equivalent is False: | ||
msg = ( | ||
f"Hash hamming distance of {self.hamming_distance} bits > " | ||
f"hamming tolerance of {self.hamming_tolerance} bits." | ||
) | ||
result = f"{message} {msg}" if len(result) else msg | ||
return result | ||
|
||
def update_summary(self, summary): | ||
super().update_summary(summary) | ||
summary["hamming_distance"] = self.hamming_distance | ||
summary["hamming_tolerance"] = self.hamming_tolerance | ||
|
||
|
||
class KernelSHA256(Kernel): | ||
""" | ||
A simple kernel that calculates a 256-bit cryptographic SHA hash | ||
of an image. | ||
|
||
""" | ||
|
||
name = KERNEL_SHA256 | ||
|
||
def equivalent_hash(self, result, baseline, marker=None): | ||
# Simple cryptographic hash binary comparison. Interpretation of | ||
# the comparison result is that the hashes are either identical or | ||
# not identical. For non-identical hashes, it is not possible to | ||
# determine a heuristic of hash "similarity" due to the nature of | ||
# cryptographic hashes. | ||
return result == baseline | ||
|
||
def generate_hash(self, buffer): | ||
buffer.seek(0) | ||
data = buffer.read() | ||
hasher = hashlib.sha256() | ||
hasher.update(data) | ||
return hasher.hexdigest() | ||
|
||
|
||
#: Registry of available hashing kernel factories. | ||
kernel_factory = { | ||
KernelPHash.name: KernelPHash, | ||
KernelSHA256.name: KernelSHA256, | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.