4 | | from string import join |
| 15 | |
| 16 | ########################################################################### |
| 17 | # Copyright (c) 2010 Nathann Cohen <nathann.cohen@gmail.com> |
| 18 | # |
| 19 | # This program is free software; you can redistribute it and/or modify |
| 20 | # it under the terms of the GNU General Public License as published by |
| 21 | # the Free Software Foundation; either version 2 of the License, or |
| 22 | # (at your option) any later version. |
| 23 | # |
| 24 | # This program is distributed in the hope that it will be useful, |
| 25 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 26 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 27 | # GNU General Public License for more details. |
| 28 | # |
| 29 | # http://www.gnu.org/licenses/ |
| 30 | ########################################################################### |
| 31 | |
| 32 | from sage.structure.sage_object import SageObject |
| 33 | |
| 34 | ########################################################################### |
| 35 | # |
| 36 | # Helper functions |
| 37 | # |
| 38 | ########################################################################### |
18 | | sage: str = "Sage is my most favorite general purpose computer algebra system" |
19 | | sage: frequency_table(str) |
20 | | {'a': 5, ' ': 9, 'c': 1, 'b': 1, 'e': 8, 'g': 3, 'f': 1, 'i': 2, 'm': 4, 's': 5, 'o': 4, 'n': 1, 'p': 3, 'S': 1, 'r': 5, 'u': 2, 't': 4, 'v': 1, 'y': 2, 'l': 2} |
| 58 | sage: str = "Stop counting my characters!" |
| 59 | sage: T = sorted(frequency_table(str).items()) |
| 60 | sage: for symbol, code in T: |
| 61 | ... print symbol, code |
| 62 | ... |
| 63 | 3 |
| 64 | ! 1 |
| 65 | S 1 |
| 66 | a 2 |
| 67 | c 3 |
| 68 | e 1 |
| 69 | g 1 |
| 70 | h 1 |
| 71 | i 1 |
| 72 | m 1 |
| 73 | n 2 |
| 74 | o 2 |
| 75 | p 1 |
| 76 | r 2 |
| 77 | s 1 |
| 78 | t 3 |
| 79 | u 1 |
| 80 | y 1 |
37 | | It can build a Huffman code from a given string, or |
38 | | from the information of a dictionary associating |
39 | | to each key (the elements of the alphabet) a weight |
40 | | (most of the time, a probability value or a number |
41 | | of occurrences). For example :: |
| 102 | - ``string`` -- (default: ``None``) a string from which the Huffman |
| 103 | encoding should be created. |
| 104 | |
| 105 | - ``table`` -- (default: ``None``) a dictionary that associates to each |
| 106 | symbol of an alphabet a numeric value. If we consider the frequency of |
| 107 | each alphabetic symbol, then ``table`` is considered as the frequency |
| 108 | table of the alphabet with each numeric (non-negative integer) value |
| 109 | being the number of occurrences of a symbol. The numeric values can also |
| 110 | represent weights of the symbols. In that case, the numeric values are |
| 111 | not necessarily integers, but can be real numbers. In general, we refer |
| 112 | to ``table`` as a weight table. |
| 113 | |
| 114 | Exactly one of ``string`` and ``table`` must be given (not ``None``). In order to |
| 115 | construct a Huffman code for an alphabet, we use exactly one of the |
| 116 | following methods: |
| 117 | |
| 118 | #. Let ``string`` be a string of symbols over an alphabet and feed |
| 119 | ``string`` to the constructor of this class. Based on the input string, |
| 120 | a frequency table is constructed that contains the frequency of each |
| 121 | unique symbol in ``string``. The alphabet in question is then all the |
| 122 | unique symbols in ``string``. A significant implication of this is that |
| 123 | any subsequent string that we want to encode must contain only symbols |
| 124 | that can be found in ``string``. |
| 125 | |
| 126 | #. Let ``table`` be the frequency table of an alphabet. We can feed this |
| 127 | table to the constructor of this class. The table ``table`` can be a |
| 128 | table of frequencies or a table of weights. |
| 129 | |
| 130 | Examples:: |
94 | | - ``string`` -- a string from which the Huffman encoding should |
95 | | be created |
96 | | |
97 | | - ``frequencies`` -- a dictionary associating its frequency or |
98 | | its number of occurrences to each letter of the alphabet. |
99 | | |
| 182 | sage: from sage.coding.source_coding.huffman import Huffman |
| 183 | sage: T = {"a":45, "b":13, "c":12, "d":16, "e":9, "f":5} |
| 184 | sage: H = Huffman(table=T) |
| 185 | sage: L = ["deaf", "bead", "fab", "bee"] |
| 186 | sage: E = [] |
| 187 | sage: for e in L: |
| 188 | ... E.append(H.encode(e)) |
| 189 | ... print E[-1] |
| 190 | ... |
| 191 | 111110101100 |
| 192 | 10111010111 |
| 193 | 11000101 |
| 194 | 10111011101 |
| 195 | sage: D = [] |
| 196 | sage: for e in E: |
| 197 | ... D.append(H.decode(e)) |
| 198 | ... print D[-1] |
| 199 | ... |
| 200 | deaf |
| 201 | bead |
| 202 | fab |
| 203 | bee |
| 204 | sage: D == L |
| 205 | True |
164 | | self._build_code_from_tree(tree[0], d, prefix=prefix+'0') |
165 | | self._build_code_from_tree(tree[1], d, prefix=prefix+'1') |
| 269 | self._build_code_from_tree(tree[0], |
| 270 | d, |
| 271 | prefix="".join([prefix, "0"])) |
| 272 | self._build_code_from_tree(tree[1], |
| 273 | d, |
| 274 | prefix="".join([prefix, "1"])) |
174 | | |
175 | | - ``dic`` (dictionary) -- associates to each letter of the alphabet |
176 | | a frequency or a number of occurrences. |
| 284 | |
| 285 | - ``dic`` -- a dictionary that associates to each symbol of an alphabet |
| 286 | a numeric value. If we consider the frequency of each alphabetic |
| 287 | symbol, then ``dic`` is considered as the frequency table of the |
| 288 | alphabet with each numeric (non-negative integer) value being the |
| 289 | number of occurrences of a symbol. The numeric values can also |
| 290 | represent weights of the symbols. In that case, the numeric values |
| 291 | are not necessarily integers, but can be real numbers. In general, |
| 292 | we refer to ``dic`` as a weight table. |
191 | | |
192 | | for i,(e,w) in enumerate(index): |
193 | | heappush(heap, (w, i) ) |
194 | | |
195 | | while len(heap)>=2: |
196 | | (w1, i1) = heappop(heap) |
197 | | (w2, i2) = heappop(heap) |
198 | | heappush(heap, (w1+w2,[i1,i2])) |
199 | | |
200 | | |
| 304 | # Each alphabetic symbol is now represented by an element with |
| 305 | # weight w and index i. |
| 306 | for i, (s, w) in enumerate(dic.items()): |
| 307 | heappush(heap, (w, i)) |
| 308 | for i in range(1, len(dic)): |
| 309 | weight_a, node_a = heappop(heap) |
| 310 | weight_b, node_b = heappop(heap) |
| 311 | heappush(heap, (weight_a + weight_b, [node_a, node_b])) |
| 312 | # dictionary of symbol to Huffman encoding |
203 | | self._build_code_from_tree(self._tree, d) |
204 | | self._index = dict([(i,e) for i,(e,w) in enumerate(index)]) |
205 | | self._character_to_code = dict([(e,d[i]) for i,(e,w) in enumerate(index)]) |
| 315 | # Walk the binary tree to build the Huffman code of each leaf, where the |
| 316 | # root of the tree is associated with the empty string. |
| 317 | self._build_code_from_tree(self._tree, d, prefix="") |
| 318 | self._index = dict((i, s) for i, (s, w) in enumerate(dic.items())) |
| 319 | self._character_to_code = dict( |
| 320 | (s, d[i]) for i, (s, w) in enumerate(dic.items())) |
219 | | This is how a string is encoded then decoded :: |
| 332 | - A Huffman encoding of ``string``. |
| 333 | |
| 334 | EXAMPLES: |
| 335 | |
| 336 | This is how a string is encoded and then decoded:: |
| 337 | |
| 338 | sage: from sage.coding.source_coding.huffman import Huffman |
| 339 | sage: str = "Sage is my most favorite general purpose computer algebra system" |
| 340 | sage: h = Huffman(str) |
| 341 | sage: encoded = h.encode(str); encoded |
| 342 | '00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011' |
| 343 | sage: h.decode(encoded) |
| 344 | 'Sage is my most favorite general purpose computer algebra system' |
| 345 | """ |
| 346 | if self._character_to_code: |
| 347 | return "".join(map(lambda x: self._character_to_code[x], string)) |
| 348 | |
| 349 | def decode(self, string): |
| 350 | r""" |
| 351 | Decode the given string using the current encoding table. |
| 352 | |
| 353 | INPUT: |
| 354 | |
| 355 | - ``string`` -- a string of Huffman encodings. |
| 356 | |
| 357 | OUTPUT: |
| 358 | |
| 359 | - The Huffman decoding of ``string``. |
| 360 | |
| 361 | EXAMPLES: |
| 362 | |
| 363 | This is how a string is encoded and then decoded:: |
230 | | if self._character_to_code: |
231 | | return join(map(lambda x:self._character_to_code[x],string), '') |
| 383 | # This walks down the Huffman binary tree, one bit at a time, to work out |
| 384 | # the symbol represented by each run of bits. This method of |
| 385 | # decoding is really slow. A faster method is needed. |
| 386 | # TODO: faster decoding implementation |
| 387 | chars = [] |
| 388 | tree = self._tree |
| 389 | index = self._index |
| 390 | for i in string: |
| 391 | if i == "0": |
| 392 | tree = tree[0] |
| 393 | elif i == "1": |
| 394 | tree = tree[1] |
| 395 | else: |
| 396 | raise ValueError("Input must be a binary string.") |
| 397 | if not isinstance(tree, list): |
| 398 | chars.append(index[tree]) |
| 399 | tree = self._tree |
| 400 | return "".join(chars) |
251 | | sage: encoded = h.encode(str); encoded |
252 | | '00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011' |
253 | | sage: h.decode(encoded) |
254 | | 'Sage is my most favorite general purpose computer algebra system' |
255 | | |
256 | | Of course, the string one tries to decode has to be a binary one. If |
257 | | not, an exception is raised :: |
258 | | |
259 | | sage: h.decode('I clearly am not a binary string') |
260 | | Traceback (most recent call last): |
| 419 | sage: T = sorted(h.encoding_table().items()) |
| 420 | sage: for symbol, code in T: |
| 421 | ... print symbol, code |
262 | | ValueError: The given string does not only contain 0 and 1 |
263 | | """ |
264 | | chars = [] |
265 | | tree = self._tree |
266 | | index = self._index |
267 | | for i in string: |
268 | | |
269 | | if i == '0': |
270 | | tree = tree[0] |
271 | | elif i == '1': |
272 | | tree = tree[1] |
273 | | else: |
274 | | raise ValueError('The given string does not only contain 0 and 1') |
275 | | |
276 | | if not isinstance(tree,list): |
277 | | chars.append(index[tree]) |
278 | | tree = self._tree |
279 | | |
280 | | return join(chars, '') |
281 | | |
282 | | def encoding_table(self): |
283 | | r""" |
284 | | Returns the current encoding table |
285 | | |
286 | | OUTPUT: |
287 | | |
288 | | A dictionary associating its code to each trained letter of |
289 | | the alphabet |
290 | | |
291 | | EXAMPLE:: |
292 | | |
293 | | sage: from sage.coding.source_coding.huffman import Huffman |
294 | | sage: str = "Sage is my most favorite general purpose computer algebra system" |
295 | | sage: h = Huffman(str) |
296 | | sage: h.encoding_table() |
297 | | {'S': '00000', 'a': '1101', ' ': '101', 'c': '110000', 'b': '110001', 'e': '010', 'g': '0001', 'f': '110010', 'i': '10000', 'm': '0011', 'l': '10011', 'o': '0110', 'n': '110011', 'p': '0010', 's': '1110', 'r': '1111', 'u': '10001', 't': '0111', 'v': '00001', 'y': '10010'} |
| 423 | 101 |
| 424 | S 00000 |
| 425 | a 1101 |
| 426 | b 110001 |
| 427 | c 110000 |
| 428 | e 010 |
| 429 | f 110010 |
| 430 | g 0001 |
| 431 | i 10000 |
| 432 | l 10011 |
| 433 | m 0011 |
| 434 | n 110011 |
| 435 | o 0110 |
| 436 | p 0010 |
| 437 | r 1111 |
| 438 | s 1110 |
| 439 | t 0111 |
| 440 | u 10001 |
| 441 | v 00001 |
| 442 | y 10010 |
324 | | def _generate_edges(self, tree, father='', id=''): |
325 | | if father=='': |
326 | | u = 'root' |
| 473 | def _generate_edges(self, tree, parent="", bit=""): |
| 474 | """ |
| 475 | Generate the edges of the given Huffman tree. |
| 476 | |
| 477 | INPUT: |
| 478 | |
| 479 | - ``tree`` -- a Huffman binary tree. |
| 480 | |
| 481 | - ``parent`` -- (default: empty string) a parent vertex with exactly |
| 482 | two children. |
| 483 | |
| 484 | - ``bit`` -- (default: empty string) the bit signifying either the |
| 485 | left or right branch. The bit "0" denotes the left branch and "1" |
| 486 | denotes the right branch. |
| 487 | |
| 488 | OUTPUT: |
| 489 | |
| 490 | - An edge list of the Huffman binary tree. |
| 491 | |
| 492 | EXAMPLES:: |
| 493 | |
| 494 | sage: from sage.coding.source_coding.huffman import Huffman |
| 495 | sage: H = Huffman("Sage") |
| 496 | sage: T = H.tree() |
| 497 | sage: T.edges(labels=None) |
| 498 | [('0', 'S: 01'), ('0', 'a: 00'), ('1', 'e: 10'), ('1', 'g: 11'), ('root', '0'), ('root', '1')] |
| 499 | """ |
| 500 | if parent == "": |
| 501 | u = "root" |
330 | | return self._generate_edges(tree[0], father=father+id, id='0') + \ |
331 | | self._generate_edges(tree[1], father=father+id, id='1') + \ |
332 | | ([(u, father+id)] if (father+id) != '' else []) |
333 | | |
| 506 | left = self._generate_edges(tree[0], parent=s, bit="0") |
| 507 | right = self._generate_edges(tree[1], parent=s, bit="1") |
| 508 | L = [(u, s)] if s != "" else [] |
| 509 | return left + right + L |