Skip to content

Add Python Implementation of Huffman Encoding #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 1, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions chapters/data_compression/huffman/code/python/huffman.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Huffman Encoding
# Python 2.7+
# Submitted by Matthew Giallourakis

from collections import Counter

# constructs the tree
def build_huffman_tree(message):

# get sorted list of character and frequency pairs
frequencies = Counter(message)
trees = frequencies.most_common()

# while there is more than one tree
while len(trees) > 1:

# pop off the two trees of least weight from the trees list
tree_left,weight_left = trees.pop()
tree_right,weight_right = trees.pop()

# combine the nodes and add back to the nodes list
new_tree = [tree_left, tree_right]
new_weight = weight_left + weight_right

# find the first tree that has a weight smaller than new_weight and returns its index in the list
# If no such tree can be found, use len(trees) instead to append
index = next((i for i, tree in enumerate(trees) if tree[1] < new_weight), len(trees))

# insert the new tree there
trees.insert(index, (new_tree, new_weight))

huffman_tree = trees[0][0]
return huffman_tree

# constructs the mapping with recursion
def build_codebook(tree, code=''):

codebook = []

# split the tree
left_tree, right_tree = tree

# if the left node has children, find the mapping of those children
# else pair the character with the current code + 0
if type(left_tree) is list:
codebook += build_codebook(left_tree, code+'0')
else:
codebook.append((left_tree, code+'0'))

# if the right node has children, find the mapping of those children
# else pair the character with the current code + 1
if type(right_tree) is list:
codebook += build_codebook(right_tree, code+'1')
else:
codebook.append((right_tree, code+'1'))
return codebook

# encodes the message
def huffman_encode(codebook, message):

encoded_message = ''

# build a char -> code dictionary
forward_dict = dict(codebook)

# replace each character with its code
for char in message:
encoded_message += forward_dict[char]

return encoded_message

# decodes a message
def huffman_decode(codebook, encoded_message):

decoded_message = ''
key = ''

# build a code -> char dictionary
inverse_dict = dict([(v, k) for k, v in codebook])

# for each bit in the encoding
# if the bit is in the dictionary, replace the bit with the paired character
# else look at the bit and the following bits together until a match occurs
# move to the next bit not yet looked at
for index, bit in enumerate(encoded_message):
key += bit
if key in inverse_dict:
decoded_message += inverse_dict[key]
key = ''

return decoded_message

def main():

# test example
message = 'bibbity_bobbity'
tree = build_huffman_tree(message)
codebook = build_codebook(tree)
encoded_message = huffman_encode(codebook, message)
decoded_message = huffman_decode(codebook, encoded_message)

print('message: ' + message)
print('huffman tree: ' + str(tree))
print('codebook: ' + str(codebook))
print('encoded message: ' + encoded_message)
print('decoded message: ' + decoded_message)

# prints the following:
#
# message: bibbity_bobbity
# huffman_tree: ['b', [[['_', 'o'], 'y'], ['t', 'i']]]
# codebook: [('b', '0'), ('_', '1000'), ('o', '1001'),
# ('y', '101'), ('t', '110'), ('i', '111')]
# encoded_message: 01110011111010110000100100111110101
# decoded_message: bibbity_bobbity

if __name__ == '__main__':
main()