Ticket #8765: trac_8765-clean-ups.patch

File trac_8765-clean-ups.patch, 23.3 KB (added by mvngu, 9 years ago)
  • sage/coding/source_coding/huffman.py

    # HG changeset patch
    # User Minh Van Nguyen <nguyenminh2@gmail.com>
    # Date 1272820963 25200
    # Node ID b1fd4ee6fea24ffadfdca0895589d932146669e4
    # Parent  a8ee567274744a60cb87fddc72d82b4e05060169
    #8765: reviewer patch: Huffman codes: code clean-ups and more documentation
    
    diff --git a/sage/coding/source_coding/huffman.py b/sage/coding/source_coding/huffman.py
    a b  
    11r"""
    22Huffman Encoding
     3
     4This module implements functionalities relating to Huffman encoding and
     5decoding.
     6
     7AUTHOR:
     8
     9- Nathann Cohen (2010-05): initial version.
     10
     11
     12Classes and functions
     13=====================
    314"""
    4 from string import join
     15
     16###########################################################################
     17# Copyright (c) 2010 Nathann Cohen <nathann.cohen@gmail.com>
     18#
     19# This program is free software; you can redistribute it and/or modify
     20# it under the terms of the GNU General Public License as published by
     21# the Free Software Foundation; either version 2 of the License, or
     22# (at your option) any later version.
     23#
     24# This program is distributed in the hope that it will be useful,
     25# but WITHOUT ANY WARRANTY; without even the implied warranty of
     26# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     27# GNU General Public License for more details.
     28#
     29# http://www.gnu.org/licenses/
     30###########################################################################
     31
     32from sage.structure.sage_object import SageObject
     33
     34###########################################################################
     35#
     36# Helper functions
     37#
     38###########################################################################
    539
    640def frequency_table(string):
    741    r"""
    8     Return the frequency table corresponding to the given
    9     string.
     42    Return the frequency table corresponding to the given string.
    1043
    1144    INPUT:
    1245
    13     - ``string`` -- a string
     46    - ``string`` -- a string of symbols over some alphabet.
    1447
    15     EXAMPLE::
     48    OUTPUT:
     49
     50    - A table of the frequency of each unique symbol in ``string``. If ``string``
     51      is an empty string, return an empty table.
     52
     53    EXAMPLES:
     54
     55    The frequency table of a non-empty string::
    1656
    1757        sage: from sage.coding.source_coding.huffman import frequency_table
    18         sage: str = "Sage is my most favorite general purpose computer algebra system"
    19         sage: frequency_table(str)
    20         {'a': 5, ' ': 9, 'c': 1, 'b': 1, 'e': 8, 'g': 3, 'f': 1, 'i': 2, 'm': 4, 's': 5, 'o': 4, 'n': 1, 'p': 3, 'S': 1, 'r': 5, 'u': 2, 't': 4, 'v': 1, 'y': 2, 'l': 2}
     58        sage: str = "Stop counting my characters!"
     59        sage: T = sorted(frequency_table(str).items())
     60        sage: for symbol, code in T:
     61        ...       print symbol, code
     62        ...
     63          3
     64        ! 1
     65        S 1
     66        a 2
     67        c 3
     68        e 1
     69        g 1
     70        h 1
     71        i 1
     72        m 1
     73        n 2
     74        o 2
     75        p 1
     76        r 2
     77        s 1
     78        t 3
     79        u 1
     80        y 1
    2181
     82    The frequency of an empty string::
     83
     84        sage: frequency_table("")
     85        {}
    2286    """
    2387    d = {}
    24     for l in string:
    25         d[l] = d.get(l,0) + 1
    26    
     88    for s in string:
     89        d[s] = d.get(s, 0) + 1
    2790    return d
    2891
     92class Huffman(SageObject):
     93    r"""
     94    This class implements the basic functionalities of Huffman codes.
    2995
    30 class Huffman():
    31     r"""
    32     Huffman Encoding
     96    It can build a Huffman code from a given string, or from the information
     97    of a dictionary associating to each key (the elements of the alphabet) a
     98    weight (most of the time, a probability value or a number of occurrences).
    3399
    34     This class implements the basic functionalities
    35     of Huffman's encoding.
     100    INPUT:
    36101
    37     It can build a Huffman code from a given string, or
    38     from the information of a dictionary associating
    39     to each key (the elements of the alphabet) a weight
    40     (most of the time, a probability value or a number
    41     of occurrences). For example ::
     102    - ``string`` -- (default: ``None``) a string from which the Huffman
     103      encoding should be created.
     104
     105    - ``table`` -- (default: ``None``) a dictionary that associates to each
     106      symbol of an alphabet a numeric value. If we consider the frequency of
     107      each alphabetic symbol, then ``table`` is considered as the frequency
     108      table of the alphabet with each numeric (non-negative integer) value
     109      being the number of occurrences of a symbol. The numeric values can also
     110      represent weights of the symbols. In that case, the numeric values are
     111      not necessarily integers, but can be real numbers. In general, we refer
     112      to ``table`` as a weight table.
     113
     114    Exactly one of ``string`` and ``table`` must be supplied. In order to
     115    construct a Huffman code for an alphabet, we use exactly one of the
     116    following methods:
     117
     118    #. Let ``string`` be a string of symbols over an alphabet and feed
     119       ``string`` to the constructor of this class. Based on the input string,
     120       a frequency table is constructed that contains the frequency of each
     121       unique symbol in ``string``. The alphabet in question is then all the
     122       unique symbols in ``string``. A significant implication of this is that
     123       any subsequent string that we want to encode must contain only symbols
     124       that can be found in ``string``.
     125
     126    #. Let ``table`` be the frequency table of an alphabet. We can feed this
     127       table to the constructor of this class. The table ``table`` can be a
     128       table of frequency or a table of weights.
     129
     130    Examples::
    42131
    43132        sage: from sage.coding.source_coding.huffman import Huffman, frequency_table
    44133        sage: h1 = Huffman("There once was a french fry")
     
    58147        'w' : 11111
    59148        'y' : 0110
    60149
    61     We could have obtained the same result by "training" the Huffman
    62     code on the following table of frequency ::
     150    We can obtain the same result by "training" the Huffman code with the
     151    following frequency table::
    63152
    64153        sage: ft = frequency_table("There once was a french fry"); ft
    65154        {'a': 2, ' ': 5, 'c': 2, 'e': 4, 'f': 2, 'h': 2, 'o': 1, 'n': 2, 's': 1, 'r': 3, 'T': 1, 'w': 1, 'y': 1}
    66         sage: h2 = Huffman(frequencies = ft)
     155        sage: h2 = Huffman(table=ft)
    67156
    68     Once ``h1`` has been trained, and hence possesses an encoding code,
     157    Once ``h1`` has been trained, and hence possesses an encoding table,
    69158    it is possible to obtain the Huffman encoding of any string
    70159    (possibly the same) using this code::
    71160
    72161        sage: encoded = h1.encode("There once was a french fry"); encoded
    73162        '11110110010001010000111001101101010000111110111111010001110010110101001101101011000010110100110'
    74163
    75     Which can be decoded the following way::
     164    We can decode the above encoded string in the following way::
    76165
    77166        sage: h1.decode(encoded)
    78167        'There once was a french fry'
    79168
    80169    Obviously, if we try to decode a string using a Huffman instance which
    81170    has been trained on a different sample (and hence has a different encoding
    82     table), we are likely to get some random-looking string ::
     171    table), we are likely to get some random-looking string::
    83172
    84173        sage: h3 = Huffman("There once were two french fries")
    85174        sage: h3.decode(encoded)
    86175        ' wehnefetrhft ne ewrowrirTc'
    87176
    88     ... precisely what we deserved :-)
     177    This does not look like our original string.
    89178
    90     INPUT:
    91        
    92     One among the following:
     179    Instead of using frequency, we can assign weights to each alphabetic
     180    symbol::
    93181
    94     - ``string`` -- a string from which the Huffman encoding should
    95       be created
    96 
    97     - ``frequencies`` -- a dictionary associating its frequency or
    98      its number of occurrences to each letter of the alphabet.
    99 
     182        sage: from sage.coding.source_coding.huffman import Huffman
     183        sage: T = {"a":45, "b":13, "c":12, "d":16, "e":9, "f":5}
     184        sage: H = Huffman(table=T)
     185        sage: L = ["deaf", "bead", "fab", "bee"]
     186        sage: E = []
     187        sage: for e in L:
     188        ...       E.append(H.encode(e))
     189        ...       print E[-1]
     190        ...
     191        111110101100
     192        10111010111
     193        11000101
     194        10111011101
     195        sage: D = []
     196        sage: for e in E:
     197        ...       D.append(H.decode(e))
     198        ...       print D[-1]
     199        ...
     200        deaf
     201        bead
     202        fab
     203        bee
     204        sage: D == L
     205        True
    100206    """
    101207
    102     def __init__(self, string = None, frequencies = None):
     208    def __init__(self, string=None, table=None):
    103209        r"""
    104         Constructor for Huffman
     210        Constructor for Huffman.
    105211
    106         INPUT:
    107        
    108         One among the following:
     212        See the docstring of this class for full documentation.
    109213
    110         - ``string`` -- a string from which the Huffman encoding should
    111           be created
    112 
    113         - ``frequencies`` -- a dictionary associating its frequency or
    114          its number of occurrences to each letter of the alphabet.
    115        
    116         EXAMPLE::
     214        EXAMPLES::
    117215
    118216            sage: from sage.coding.source_coding.huffman import Huffman
    119217            sage: str = "Sage is my most favorite general purpose computer algebra system"
    120218            sage: h = Huffman(str)
    121219
    122         If both arguments are supplied, an exception is raised ::
     220        TESTS:
    123221
    124             sage: Huffman(string=str, frequencies={'a':8})
     222        If both arguments are supplied, an exception is raised::
     223
     224            sage: Huffman(string=str, table={'a':8})
    125225            Traceback (most recent call last):
    126226            ...
    127             ValueError: Exactly one of `string` or `frequencies` parameters must be defined
     227            ValueError: Exactly one of 'string' and 'table' cannot be None.
     228        """
     229        if (string is not None) and (table is not None):
     230            raise ValueError(
     231                "Exactly one of 'string' and 'table' cannot be None.")
    128232
    129         """
    130 
     233        # alphabetic symbol to Huffman encoding translation table
    131234        self._character_to_code = []
    132 
    133         if sum([string is not None, frequencies is not None]) != 1:
    134             raise ValueError("Exactly one of `string` or `frequencies` parameters must be defined")
    135 
     235        # Huffman binary tree
     236        self._tree = None
     237        # index of each alphabetic symbol
     238        self._index = None
    136239        if string is not None:
    137240            self._build_code(frequency_table(string))
    138         elif frequencies is not None:
    139             self._build_code(frequencies)
     241        elif table is not None:
     242            self._build_code(table)
    140243
    141     def _build_code_from_tree(self, tree, d, prefix=''):
     244    def _build_code_from_tree(self, tree, d, prefix):
    142245        r"""
    143         Builds the code corresponding to a given tree and prefix
     246        Builds the Huffman code corresponding to a given tree and prefix.
    144247
    145248        INPUT:
    146249
     
    151254        - ``prefix`` (string) -- binary string which is the prefix
    152255          of any element of the tree
    153256
    154         EXAMPLE::
     257        EXAMPLES::
    155258
    156259            sage: from sage.coding.source_coding.huffman import Huffman
    157260            sage: str = "Sage is my most favorite general purpose computer algebra system"
    158261            sage: h = Huffman(str)
    159262            sage: d = {}
    160             sage: h._build_code_from_tree(h._tree, d)
    161 
     263            sage: h._build_code_from_tree(h._tree, d, prefix="")
    162264        """
     265        # This is really a recursive construction of a Huffman code. By
     266        # feeding this class a sufficiently large alphabet, it is possible to
     267        # exceed the maximum recursion depth and hence result in a RuntimeError.
    163268        try:
    164             self._build_code_from_tree(tree[0], d, prefix=prefix+'0')
    165             self._build_code_from_tree(tree[1], d, prefix=prefix+'1')
     269            self._build_code_from_tree(tree[0],
     270                                       d,
     271                                       prefix="".join([prefix, "0"]))
     272            self._build_code_from_tree(tree[1],
     273                                       d,
     274                                       prefix="".join([prefix, "1"]))
    166275        except TypeError:
    167276            d[tree] = prefix
    168277
    169278    def _build_code(self, dic):
    170279        r"""
    171         Returns a Huffman code for each one of the given elements.
    172    
     280        Constructs a Huffman code corresponding to an alphabet with the given
     281        weight table.
     282
    173283        INPUT:
    174    
    175         - ``dic`` (dictionary) -- associates to each letter of the alphabet
    176           a frequency or a number of occurrences.
     284
     285        - ``dic`` -- a dictionary that associates to each symbol of an alphabet
     286          a numeric value. If we consider the frequency of each alphabetic
     287          symbol, then ``dic`` is considered as the frequency table of the
     288          alphabet with each numeric (non-negative integer) value being the
     289          number of occurrences of a symbol. The numeric values can also
     290          represent weights of the symbols. In that case, the numeric values
     291          are not necessarily integers, but can be real numbers. In general,
     292          we refer to ``dic`` as a weight table.
    177293
    178294        EXAMPLE::
    179295
     
    183299            sage: d = {}
    184300            sage: h._build_code(frequency_table(str))
    185301        """
    186    
    187302        from heapq import heappush, heappop
    188    
    189         index = dic.items()
    190303        heap = []
    191    
    192         for i,(e,w) in enumerate(index):
    193             heappush(heap, (w, i) )
    194    
    195         while len(heap)>=2:
    196             (w1, i1) = heappop(heap)
    197             (w2, i2) = heappop(heap)
    198             heappush(heap, (w1+w2,[i1,i2]))
    199    
    200    
     304        # Each alphabetic symbol is now represented by an element with
     305        # weight w and index i.
     306        for i, (s, w) in enumerate(dic.items()):
     307            heappush(heap, (w, i))
     308        for i in range(1, len(dic)):
     309            weight_a, node_a = heappop(heap)
     310            weight_b, node_b = heappop(heap)
     311            heappush(heap, (weight_a + weight_b, [node_a, node_b]))
     312        # dictionary of symbol to Huffman encoding
    201313        d = {}
    202314        self._tree = heap[0][1]
    203         self._build_code_from_tree(self._tree, d)
    204         self._index = dict([(i,e) for i,(e,w) in enumerate(index)])
    205         self._character_to_code = dict([(e,d[i]) for i,(e,w) in enumerate(index)])
     315        # Walk the Huffman binary tree to derive the code word of each symbol,
     316        # where the root of the tree is associated with the empty string.
     317        self._build_code_from_tree(self._tree, d, prefix="")
     318        self._index = dict((i, s) for i, (s, w) in enumerate(dic.items()))
     319        self._character_to_code = dict(
     320            (s, d[i]) for i, (s, w) in enumerate(dic.items()))
    206321
    207    
    208322    def encode(self, string):
    209323        r"""
    210         Returns an encoding of the given string based
    211         on the current encoding table
     324        Encode the given string based on the current encoding table.
    212325
    213326        INPUT:
    214327
    215         - ``string`` (string)
     328        - ``string`` -- a string of symbols over an alphabet.
    216329
    217         EXAMPLE:
     330        OUTPUT:
    218331
    219         This is how a string is encoded then decoded ::
     332        - A Huffman encoding of ``string``.
     333
     334        EXAMPLES:
     335
     336        This is how a string is encoded and then decoded::
     337
     338            sage: from sage.coding.source_coding.huffman import Huffman
     339            sage: str = "Sage is my most favorite general purpose computer algebra system"
     340            sage: h = Huffman(str)
     341            sage: encoded = h.encode(str); encoded
     342            '00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011'
     343            sage: h.decode(encoded)
     344            'Sage is my most favorite general purpose computer algebra system'
     345        """
     346        if self._character_to_code:
     347            return "".join(map(lambda x: self._character_to_code[x], string))
     348
     349    def decode(self, string):
     350        r"""
     351        Decode the given string using the current encoding table.
     352
     353        INPUT:
     354
     355        - ``string`` -- a binary string (only ``0`` and ``1``) of Huffman code words.
     356
     357        OUTPUT:
     358
     359        - The Huffman decoding of ``string``.
     360
     361        EXAMPLES:
     362
     363        This is how a string is encoded and then decoded::
    220364
    221365            sage: from sage.coding.source_coding.huffman import Huffman
    222366            sage: str = "Sage is my most favorite general purpose computer algebra system"
     
    226370            sage: h.decode(encoded)
    227371            'Sage is my most favorite general purpose computer algebra system'
    228372
     373        TESTS:
     374
     375        Of course, the string one tries to decode has to be a binary one. If
     376        not, an exception is raised::
     377
     378            sage: h.decode('I clearly am not a binary string')
     379            Traceback (most recent call last):
     380            ...
     381            ValueError: Input must be a binary string.
    229382        """
    230         if self._character_to_code:
    231             return join(map(lambda x:self._character_to_code[x],string), '')
     383        # This traverses the whole Huffman binary tree in order to work out
     384        # the symbol represented by a stream of bits. This method of
     385        # decoding is really slow. A faster method is needed.
     386        # TODO: faster decoding implementation
     387        chars = []
     388        tree = self._tree
     389        index = self._index
     390        for i in string:
     391            if i == "0":
     392                tree = tree[0]
     393            elif i == "1":
     394                tree = tree[1]
     395            else:
     396                raise ValueError("Input must be a binary string.")
     397            if not isinstance(tree, list):
     398                chars.append(index[tree])
     399                tree = self._tree
     400        return "".join(chars)
    232401
    233 
    234     def decode(self, string):
     402    def encoding_table(self):
    235403        r"""
    236         Returns a decoded version of the given string
    237         corresponding to the current encoding table.
     404        Returns the current encoding table.
    238405
    239406        INPUT:
    240407
    241         - ``string`` (string)
     408        - None.
    242409
     410        OUTPUT:
    243411
    244         EXAMPLE:
     412        - A dictionary associating an alphabetic symbol to a Huffman encoding.
    245413
    246         This is how a string is encoded then decoded ::
     414        EXAMPLES::
    247415
    248416            sage: from sage.coding.source_coding.huffman import Huffman
    249417            sage: str = "Sage is my most favorite general purpose computer algebra system"
    250418            sage: h = Huffman(str)
    251             sage: encoded = h.encode(str); encoded
    252             '00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011'
    253             sage: h.decode(encoded)
    254             'Sage is my most favorite general purpose computer algebra system'
    255 
    256         Of course, the string one tries to decode has to be a binary one. If
    257         not, an exception is raised ::
    258 
    259             sage: h.decode('I clearly am not a binary string')
    260             Traceback (most recent call last):
     419            sage: T = sorted(h.encoding_table().items())
     420            sage: for symbol, code in T:
     421            ...       print symbol, code
    261422            ...
    262             ValueError: The given string does not only contain 0 and 1
    263         """
    264         chars = []
    265         tree = self._tree
    266         index = self._index
    267         for i in string:
    268            
    269             if i == '0':
    270                 tree = tree[0]
    271             elif i == '1':
    272                 tree = tree[1]
    273             else:
    274                 raise ValueError('The given string does not only contain 0 and 1')
    275 
    276             if not isinstance(tree,list):
    277                 chars.append(index[tree])
    278                 tree = self._tree
    279 
    280         return join(chars, '')
    281 
    282     def encoding_table(self):
    283         r"""
    284         Returns the current encoding table
    285 
    286         OUTPUT:
    287 
    288         A dictionary associating its code to each trained letter of
    289         the alphabet
    290 
    291         EXAMPLE::
    292 
    293             sage: from sage.coding.source_coding.huffman import Huffman
    294             sage: str = "Sage is my most favorite general purpose computer algebra system"
    295             sage: h = Huffman(str)
    296             sage: h.encoding_table()
    297             {'S': '00000', 'a': '1101', ' ': '101', 'c': '110000', 'b': '110001', 'e': '010', 'g': '0001', 'f': '110010', 'i': '10000', 'm': '0011', 'l': '10011', 'o': '0110', 'n': '110011', 'p': '0010', 's': '1110', 'r': '1111', 'u': '10001', 't': '0111', 'v': '00001', 'y': '10010'}
     423              101
     424            S 00000
     425            a 1101
     426            b 110001
     427            c 110000
     428            e 010
     429            f 110010
     430            g 0001
     431            i 10000
     432            l 10011
     433            m 0011
     434            n 110011
     435            o 0110
     436            p 0010
     437            r 1111
     438            s 1110
     439            t 0111
     440            u 10001
     441            v 00001
     442            y 10010
    298443        """
    299444        return self._character_to_code.copy()
    300445
    301446    def tree(self):
    302447        r"""
    303         Returns the Huffman tree corresponding to the current encoding
     448        Returns the Huffman tree corresponding to the current encoding.
     449
     450        INPUT:
     451
     452        - None.
    304453
    305454        OUTPUT:
    306455
    307         A tree
     456        - The binary tree representing a Huffman code.
    308457
    309         EXAMPLE::
     458        EXAMPLES::
    310459
    311460            sage: from sage.coding.source_coding.huffman import Huffman
    312461            sage: str = "Sage is my most favorite general purpose computer algebra system"
     
    314463            sage: T = h.tree(); T
    315464            Digraph on 39 vertices
    316465            sage: T.show(figsize=[20,20])
     466            <BLANKLINE>
    317467        """
    318 
    319468        from sage.graphs.digraph import DiGraph
    320469        g = DiGraph()
    321470        g.add_edges(self._generate_edges(self._tree))
    322471        return g
    323472
    324     def _generate_edges(self, tree, father='', id=''):
    325         if father=='':
    326             u = 'root'
     473    def _generate_edges(self, tree, parent="", bit=""):
     474        """
     475        Generate the edges of the given Huffman tree.
     476
     477        INPUT:
     478
     479        - ``tree`` -- a Huffman binary tree.
     480
     481        - ``parent`` -- (default: empty string) a parent vertex with exactly
     482          two children.
     483
     484        - ``bit`` -- (default: empty string) the bit signifying either the
     485          left or right branch. The bit "0" denotes the left branch and "1"
     486          denotes the right branch.
     487
     488        OUTPUT:
     489
     490        - An edge list of the Huffman binary tree.
     491
     492        EXAMPLES::
     493
     494            sage: from sage.coding.source_coding.huffman import Huffman
     495            sage: H = Huffman("Sage")
     496            sage: T = H.tree()
     497            sage: T.edges(labels=None)
     498            [('0', 'S: 01'), ('0', 'a: 00'), ('1', 'e: 10'), ('1', 'g: 11'), ('root', '0'), ('root', '1')]
     499        """
     500        if parent == "":
     501            u = "root"
    327502        else:
    328             u = father
     503            u = parent
     504        s = "".join([parent, bit])
    329505        try:
    330             return self._generate_edges(tree[0], father=father+id, id='0') + \
    331                 self._generate_edges(tree[1], father=father+id, id='1') + \
    332                 ([(u, father+id)] if (father+id) != '' else [])
    333 
     506            left = self._generate_edges(tree[0], parent=s, bit="0")
     507            right = self._generate_edges(tree[1], parent=s, bit="1")
     508            L = [(u, s)] if s != "" else []
     509            return left + right + L
    334510        except TypeError:
    335             return [(u, self.decode(father+id)+' : '+(father+id))]
     511            return [(u, "".join([self.decode(s), ": ", s]))]