| 1 | r""" |
| 2 | Huffman Encoding |
| 3 | """ |
| 4 | from string import join |
| 5 | |
def frequency_table(string):
    r"""
    Count how many times each symbol occurs in ``string``.

    INPUT:

    - ``string`` -- a string (it is simply iterated over, one symbol
      at a time)

    OUTPUT:

    A dictionary mapping every distinct symbol met while iterating over
    ``string`` to its number of occurrences.

    EXAMPLE::

        sage: from sage.coding.source_coding import frequency_table
        sage: text = "Sage is my most favorite general purpose computer algebra system"
        sage: frequency_table(text)
        {'a': 5, ' ': 9, 'c': 1, 'b': 1, 'e': 8, 'g': 3, 'f': 1, 'i': 2, 'm': 4, 's': 5, 'o': 4, 'n': 1, 'p': 3, 'S': 1, 'r': 5, 'u': 2, 't': 4, 'v': 1, 'y': 2, 'l': 2}

    """
    occurrences = {}
    for symbol in string:
        if symbol in occurrences:
            occurrences[symbol] += 1
        else:
            # First time this symbol is met.
            occurrences[symbol] = 1

    return occurrences
| 28 | |
| 29 | |
class Huffman(object):
    r"""
    Huffman Encoding

    This class implements the basic functionalities
    of Huffman's encoding.

    It can build a Huffman code from a given string, or
    from the information of a dictionary associating
    to each key (the elements of the alphabet) a weight
    (most of the time, a probability value or a number
    of occurrences). For example ::

        sage: from sage.coding.source_coding import Huffman, frequency_table
        sage: h1 = Huffman("There once was a french fry")
        sage: for letter, code in h1.encoding_table().iteritems():
        ...       print "'"+ letter + "' : " + code
        'a' : 0111
        ' ' : 00
        'c' : 1010
        'e' : 100
        'f' : 1011
        'h' : 1100
        'o' : 11100
        'n' : 1101
        's' : 11101
        'r' : 010
        'T' : 11110
        'w' : 11111
        'y' : 0110

    We could have obtained the same result by "training" the Huffman
    code on the following table of frequency ::

        sage: ft = frequency_table("There once was a french fry"); ft
        {'a': 2, ' ': 5, 'c': 2, 'e': 4, 'f': 2, 'h': 2, 'o': 1, 'n': 2, 's': 1, 'r': 3, 'T': 1, 'w': 1, 'y': 1}
        sage: h2 = Huffman(frequencies = ft)

    Once ``h1`` has been trained, and hence possesses an encoding code,
    it is possible to obtain the Huffman encoding of any string
    (possibly the same) using this code::

        sage: encoded = h1.encode("There once was a french fry"); encoded
        '11110110010001010000111001101101010000111110111111010001110010110101001101101011000010110100110'

    Which can be decoded the following way::

        sage: h1.decode(encoded)
        'There once was a french fry'

    Obviously, if we try to decode a string using a Huffman instance which
    has been trained on a different sample (and hence has a different encoding
    table), we are likely to get some random-looking string ::

        sage: h3 = Huffman("There once were two french fries")
        sage: h3.decode(encoded)
        ' wehnefetrhft ne ewrowrirTc'

    ... precisely what we deserved :-)

    .. NOTE::

        The symbols are numbered in the order in which the frequency
        dictionary lists them, and these numbers are used to break
        ties between equal weights. The exact codewords therefore
        depend on the iteration order of that dictionary.

    INPUT:

    One among the following:

    - ``string`` -- a string from which the Huffman encoding should
      be created

    - ``frequencies`` -- a dictionary associating its frequency or
      its number of occurrences to each letter of the alphabet.

    """

    def __init__(self, string = None, frequencies = None):
        r"""
        Constructor for Huffman

        INPUT:

        One among the following:

        - ``string`` -- a string from which the Huffman encoding should
          be created

        - ``frequencies`` -- a dictionary associating its frequency or
          its number of occurrences to each letter of the alphabet.

        A ``ValueError`` is raised when both or none of the arguments are
        given, and when the resulting alphabet is empty.

        EXAMPLE::

            sage: from sage.coding.source_coding import Huffman
            sage: text = "Sage is my most favorite general purpose computer algebra system"
            sage: h = Huffman(text)

        If both arguments are supplied, an exception is raised ::

            sage: Huffman(string=text, frequencies={'a':8})
            Traceback (most recent call last):
            ...
            ValueError: Exactly one of `string` or `frequencies` parameters must be defined

        """
        # Symbol -> codeword table; filled in by ``_build_code``.
        self._character_to_code = {}

        # Exactly one of the two arguments must be given: reject the call
        # when they are both missing or both present.
        if (string is None) == (frequencies is None):
            raise ValueError("Exactly one of `string` or `frequencies` parameters must be defined")

        if string is not None:
            self._build_code(frequency_table(string))
        else:
            self._build_code(frequencies)

    def _build_code_from_tree(self, tree, d, prefix=''):
        r"""
        Builds the code corresponding to a given tree and prefix

        INPUT:

        - ``tree`` -- integer (a leaf, i.e. the index of a symbol), or
          list of subtrees (of size `2`, except for the one-symbol
          alphabet whose tree is a list of size `1`)

        - ``d`` -- the dictionary to fill. Each leaf of ``tree`` is
          associated in ``d`` with its binary codeword.

        - ``prefix`` (string) -- binary string which is the prefix
          of any element of the tree

        EXAMPLE::

            sage: from sage.coding.source_coding import Huffman
            sage: text = "Sage is my most favorite general purpose computer algebra system"
            sage: h = Huffman(text)
            sage: d = {}
            sage: h._build_code_from_tree(h._tree, d)

        """
        if isinstance(tree, list):
            # Internal node: the i-th child extends the prefix with bit i.
            for bit, subtree in enumerate(tree):
                self._build_code_from_tree(subtree, d, prefix=prefix + str(bit))
        else:
            # Leaf: the accumulated prefix is its codeword.
            d[tree] = prefix

    def _build_code(self, dic):
        r"""
        Builds the Huffman tree and the encoding table for the given
        elements, and stores them in ``self._tree``, ``self._index`` and
        ``self._character_to_code``.

        INPUT:

        - ``dic`` (dictionary) -- associates to each letter of the alphabet
          a frequency or a number of occurrences.

        A ``ValueError`` is raised if ``dic`` is empty.

        EXAMPLE::

            sage: from sage.coding.source_coding import Huffman, frequency_table
            sage: text = "Sage is my most favorite general purpose computer algebra system"
            sage: h = Huffman(text)
            sage: h._build_code(frequency_table(text))
        """
        from heapq import heapify, heappush, heappop

        # Snapshot the items once so that the numbering of the symbols is
        # the same everywhere below.
        symbols = list(dic.items())
        if not symbols:
            raise ValueError("Cannot build a Huffman code from an empty alphabet")

        # Heap entries are ``(weight, key, subtree)``. ``key`` breaks ties
        # between equal weights without ever comparing the subtrees
        # themselves (Python 3 refuses to compare an integer with a list):
        #
        # - a leaf of index ``i`` has key ``(0, i)``;
        # - a node with children of keys ``k1`` and ``k2`` has key
        #   ``(1, k1, k2)``.
        #
        # Hence leaves come before internal nodes, and internal nodes are
        # compared child by child. Two distinct entries never share a key,
        # so the third component is never reached by a comparison.
        heap = [(weight, (0, i), i) for i, (_, weight) in enumerate(symbols)]
        heapify(heap)

        # Repeatedly merge the two lightest subtrees.
        while len(heap) >= 2:
            (w1, k1, t1) = heappop(heap)
            (w2, k2, t2) = heappop(heap)
            heappush(heap, (w1 + w2, (1, k1, k2), [t1, t2]))

        tree = heap[0][2]
        if not isinstance(tree, list):
            # One-symbol alphabet: hang the unique leaf below a root so
            # that it gets the codeword '0' instead of the empty string
            # (which would make every encoded text empty).
            tree = [tree]

        codes = {}
        self._tree = tree
        self._build_code_from_tree(tree, codes)
        # Leaf index -> symbol, used when decoding.
        self._index = dict((i, symbol) for i, (symbol, _) in enumerate(symbols))
        # Symbol -> codeword, used when encoding.
        self._character_to_code = dict((symbol, codes[i])
                                       for i, (symbol, _) in enumerate(symbols))

    def encode(self, string):
        r"""
        Returns an encoding of the given string based
        on the current encoding table

        INPUT:

        - ``string`` (string)

        OUTPUT:

        The concatenation of the codewords of the symbols of ``string``.
        A ``KeyError`` is raised if ``string`` contains a symbol which
        is not in the encoding table.

        EXAMPLE:

        This is how a string is encoded then decoded ::

            sage: from sage.coding.source_coding import Huffman
            sage: text = "Sage is my most favorite general purpose computer algebra system"
            sage: h = Huffman(text)
            sage: encoded = h.encode(text); encoded
            '00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011'
            sage: h.decode(encoded)
            'Sage is my most favorite general purpose computer algebra system'

        """
        table = self._character_to_code
        return ''.join([table[symbol] for symbol in string])

    def decode(self, string):
        r"""
        Returns a decoded version of the given string
        corresponding to the current encoding table.

        INPUT:

        - ``string`` (string) -- a binary string, i.e. made of the
          characters ``0`` and ``1`` only

        OUTPUT:

        The concatenation of the symbols read while following ``string``
        in the Huffman tree. Trailing bits which do not form a complete
        codeword are ignored.

        EXAMPLE:

        This is how a string is encoded then decoded ::

            sage: from sage.coding.source_coding import Huffman
            sage: text = "Sage is my most favorite general purpose computer algebra system"
            sage: h = Huffman(text)
            sage: h.decode(h.encode(text))
            'Sage is my most favorite general purpose computer algebra system'

        Of course, the string one tries to decode has to be a binary one. If
        not, an exception is raised ::

            sage: h.decode('I clearly am not a binary string')
            Traceback (most recent call last):
            ...
            ValueError: The given string does not only contain 0 and 1
        """
        chars = []
        root = self._tree
        index = self._index
        node = root

        for bit in string:
            if bit == '0':
                node = node[0]
            elif bit == '1':
                if len(node) < 2:
                    # Only the root of a one-symbol alphabet has a single
                    # child: '0' is then the only codeword.
                    raise ValueError('The given string is not a valid encoding for this code')
                node = node[1]
            else:
                raise ValueError('The given string does not only contain 0 and 1')

            if not isinstance(node, list):
                # A leaf is reached: output its symbol, restart at the root.
                chars.append(index[node])
                node = root

        return ''.join(chars)

    def encoding_table(self):
        r"""
        Returns the current encoding table

        OUTPUT:

        A dictionary associating its code to each trained letter of
        the alphabet. It is a copy: modifying it does not affect the
        encoding.

        EXAMPLE::

            sage: from sage.coding.source_coding import Huffman
            sage: text = "Sage is my most favorite general purpose computer algebra system"
            sage: h = Huffman(text)
            sage: h.encoding_table()
            {'S': '00000', 'a': '1101', ' ': '101', 'c': '110000', 'b': '110001', 'e': '010', 'g': '0001', 'f': '110010', 'i': '10000', 'm': '0011', 'l': '10011', 'o': '0110', 'n': '110011', 'p': '0010', 's': '1110', 'r': '1111', 'u': '10001', 't': '0111', 'v': '00001', 'y': '10010'}
        """
        return self._character_to_code.copy()

    def tree(self):
        r"""
        Returns the Huffman tree corresponding to the current encoding

        OUTPUT:

        A tree, as a ``DiGraph`` built from the edges returned by
        ``_generate_edges``.

        EXAMPLE::

            sage: from sage.coding.source_coding import Huffman
            sage: text = "Sage is my most favorite general purpose computer algebra system"
            sage: h = Huffman(text)
            sage: T = h.tree(); T
            Digraph on 39 vertices
            sage: T.show(figsize=[20,20])
        """
        from sage.graphs.digraph import DiGraph
        g = DiGraph()
        g.add_edges(self._generate_edges(self._tree))
        return g

    def _generate_edges(self, tree, father='', id=''):
        r"""
        Returns the list of the edges of the given subtree of the
        Huffman tree, including the edge joining it to its father.

        The root is named ``'root'``, any other internal node is named
        after the binary path leading to it, and a leaf is named
        ``'<symbol> : <codeword>'``.

        INPUT:

        - ``tree`` -- integer (a leaf), or list of subtrees

        - ``father`` (string) -- binary path from the root to the father
          of ``tree`` (empty for the root and for its children)

        - ``id`` (string) -- the bit leading from the father to ``tree``
          (empty for the root). The name shadows a builtin but is kept,
          as it is part of the signature.

        OUTPUT:

        A list of pairs ``(u, v)``, each of them being an edge from ``u``
        to its child ``v``.
        """
        path = father + id
        parent = father if father != '' else 'root'

        if not isinstance(tree, list):
            # Leaf: ``path`` is a whole codeword, which decodes to its symbol.
            return [(parent, self.decode(path) + ' : ' + path)]

        edges = []
        for bit, subtree in enumerate(tree):
            edges.extend(self._generate_edges(subtree, father=path, id=str(bit)))

        # The root has no father, hence no incoming edge.
        if path != '':
            edges.append((parent, path))

        return edges