Ticket #8765: trac_8765.patch

File trac_8765.patch, 11.9 KB (added by ncohen, 9 years ago)
  • doc/en/reference/coding.rst

    # HG changeset patch
    # User Nathann Cohen <nathann.cohen@gmail.com>
    # Date 1272189853 -7200
    # Node ID 8c3fe1294f6c4cf80bc2e8071c3b0f5ae41ec898
    # Parent  984d9ce41216f5724e054bd40a4f9aa220fde954
    trac #8765 : Huffman encoding
    
    diff -r 984d9ce41216 -r 8c3fe1294f6c doc/en/reference/coding.rst
    a b  
    77   sage/coding/linear_code
    88   sage/coding/code_constructions
    99   sage/coding/sd_codes
    10    sage/coding/code_bounds
    11  No newline at end of file
     10   sage/coding/code_bounds
     11   sage/coding/source_coding
     12 No newline at end of file
  • new file sage/coding/source_coding.py

    diff -r 984d9ce41216 -r 8c3fe1294f6c sage/coding/source_coding.py
    - +  
     1r"""
     2Huffman Encoding
     3"""
     4from string import join
     5
     6def frequency_table(string):
     7    r"""
     8    Return the frequency table corresponding to the given
     9    string.
     10
     11    INPUT:
     12
     13    - ``string`` -- a string
     14
     15    EXAMPLE::
     16
     17        sage: from sage.coding.source_coding import frequency_table
     18        sage: str = "Sage is my most favorite general purpose computer algebra system"
     19        sage: frequency_table(str)
     20        {'a': 5, ' ': 9, 'c': 1, 'b': 1, 'e': 8, 'g': 3, 'f': 1, 'i': 2, 'm': 4, 's': 5, 'o': 4, 'n': 1, 'p': 3, 'S': 1, 'r': 5, 'u': 2, 't': 4, 'v': 1, 'y': 2, 'l': 2}
     21
     22    """
     23    d = {}
     24    for l in string:
     25        d[l] = d.get(l,0) + 1
     26   
     27    return d
     28
     29
     30class Huffman():
     31    r"""
     32    Huffman Encoding
     33
     34    This class implements the basic functionalities
     35    of Huffman's encoding.
     36
     37    It can build a Huffman code from a given string, or
     38    from the information of a dictionary associating
     39    to each key (the elements of the alphabet) a weight
     40    (most of the time, a probability value or a number
     41    of occurrences). For example ::
     42
     43        sage: from sage.coding.source_coding import Huffman, frequency_table
     44        sage: h1 = Huffman("There once was a french fry")
     45        sage: for letter, code in h1.encoding_table().iteritems():
     46        ...       print "'"+ letter + "' : " + code
     47        'a' : 0111
     48        ' ' : 00
     49        'c' : 1010
     50        'e' : 100
     51        'f' : 1011
     52        'h' : 1100
     53        'o' : 11100
     54        'n' : 1101
     55        's' : 11101
     56        'r' : 010
     57        'T' : 11110
     58        'w' : 11111
     59        'y' : 0110
     60
     61    We could have obtained the same result by "training" the Huffman
     62    code on the following table of frequency ::
     63
     64        sage: ft = frequency_table("There once was a french fry"); ft
     65        {'a': 2, ' ': 5, 'c': 2, 'e': 4, 'f': 2, 'h': 2, 'o': 1, 'n': 2, 's': 1, 'r': 3, 'T': 1, 'w': 1, 'y': 1}
     66        sage: h2 = Huffman(frequencies = ft)
     67
     68    Once ``h1`` has been trained, and hence possesses an encoding code,
     69    it is possible to obtain the Huffman encoding of any string
     70    (possibly the same) using this code::
     71
     72        sage: encoded = h1.encode("There once was a french fry"); encoded
     73        '11110110010001010000111001101101010000111110111111010001110010110101001101101011000010110100110'
     74
     75    Which can be decoded the following way::
     76
     77        sage: h1.decode(encoded)
     78        'There once was a french fry'
     79
     80    Obviously, if we try to decode a string using a Huffman instance which
     81    has been trained on a different sample (and hence has a different encoding
     82    table), we are likely to get some random-looking string ::
     83
     84        sage: h3 = Huffman("There once were two french fries")
     85        sage: h3.decode(encoded)
     86        ' wehnefetrhft ne ewrowrirTc'
     87
     88    ... precisely what we deserved :-)
     89
     90    INPUT:
     91       
     92    One among the following:
     93
     94    - ``string`` -- a string from which the Huffman encoding should
     95      be created
     96
     97    - ``frequencies`` -- a dictionary associating its frequency or
     98     its number of occurrences to each letter of the alphabet.
     99
     100    """
     101
     102    def __init__(self, string = None, frequencies = None):
     103        r"""
     104        Constructor for Huffman
     105
     106        INPUT:
     107       
     108        One among the following:
     109
     110        - ``string`` -- a string from which the Huffman encoding should
     111          be created
     112
     113        - ``frequencies`` -- a dictionary associating its frequency or
     114         its number of occurrences to each letter of the alphabet.
     115       
     116        EXAMPLE::
     117
     118            sage: from sage.coding.source_coding import Huffman
     119            sage: str = "Sage is my most favorite general purpose computer algebra system"
     120            sage: h = Huffman(str)
     121
     122        If both arguments are supplied, an exception is raised ::
     123
     124            sage: Huffman(string=str, frequencies={'a':8})
     125            Traceback (most recent call last):
     126            ...
     127            ValueError: Exactly one of `string` or `frequencies` parameters must be defined
     128
     129        """
     130
     131        self._character_to_code = []
     132
     133        if sum([string is not None, frequencies is not None]) != 1:
     134            raise ValueError("Exactly one of `string` or `frequencies` parameters must be defined")
     135
     136        if string is not None:
     137            self._build_code(frequency_table(string))
     138        elif frequencies is not None:
     139            self._build_code(frequencies)
     140
     141    def _build_code_from_tree(self, tree, d, prefix=''):
     142        r"""
     143        Builds the code corresponding to a given tree and prefix
     144
     145        INPUT:
     146
     147        - ``tree`` -- integer, or list of size `2`
     148
     149        - ``d`` -- the dictionary to fill
     150
     151        - ``prefix`` (string) -- binary string which is the prefix
     152          of any element of the tree
     153
     154        EXAMPLE::
     155
     156            sage: from sage.coding.source_coding import Huffman
     157            sage: str = "Sage is my most favorite general purpose computer algebra system"
     158            sage: h = Huffman(str)
     159            sage: d = {}
     160            sage: h._build_code_from_tree(h._tree, d)
     161
     162        """
     163        try:
     164            self._build_code_from_tree(tree[0], d, prefix=prefix+'0')
     165            self._build_code_from_tree(tree[1], d, prefix=prefix+'1')
     166        except TypeError:
     167            d[tree] = prefix
     168
     169    def _build_code(self, dic):
     170        r"""
     171        Returns a Huffman code for each one of the given elements.
     172   
     173        INPUT:
     174   
     175        - ``dic`` (dictionary) -- associates to each letter of the alphabet
     176          a frequency or a number of occurrences.
     177
     178        EXAMPLE::
     179
     180            sage: from sage.coding.source_coding import Huffman, frequency_table
     181            sage: str = "Sage is my most favorite general purpose computer algebra system"
     182            sage: h = Huffman(str)
     183            sage: d = {}
     184            sage: h._build_code(frequency_table(str))
     185        """
     186   
     187        from heapq import heappush, heappop
     188   
     189        index = dic.items()
     190        heap = []
     191   
     192        for i,(e,w) in enumerate(index):
     193            heappush(heap, (w, i) )
     194   
     195        while len(heap)>=2:
     196            (w1, i1) = heappop(heap)
     197            (w2, i2) = heappop(heap)
     198            heappush(heap, (w1+w2,[i1,i2]))
     199   
     200   
     201        d = {}
     202        self._tree = heap[0][1]
     203        self._build_code_from_tree(self._tree, d)
     204        self._index = dict([(i,e) for i,(e,w) in enumerate(index)])
     205        self._character_to_code = dict([(e,d[i]) for i,(e,w) in enumerate(index)])
     206
     207   
     208    def encode(self, string):
     209        r"""
     210        Returns an encoding of the given string based
     211        on the current encoding table
     212
     213        INPUT:
     214
     215        - ``string`` (string)
     216
     217        EXAMPLE:
     218
     219        This is how a string is encoded then decoded ::
     220
     221            sage: from sage.coding.source_coding import Huffman
     222            sage: str = "Sage is my most favorite general purpose computer algebra system"
     223            sage: h = Huffman(str)
     224            sage: encoded = h.encode(str); encoded
     225            '00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011'
     226            sage: h.decode(encoded)
     227            'Sage is my most favorite general purpose computer algebra system'
     228
     229        """
     230        if self._character_to_code:
     231            return join(map(lambda x:self._character_to_code[x],string), '')
     232
     233
     234    def decode(self, string):
     235        r"""
     236        Returns a decoded version of the given string
     237        corresponding to the current encoding table.
     238
     239        INPUT:
     240
     241        - ``string`` (string)
     242
     243
     244        EXAMPLE:
     245
     246        This is how a string is encoded then decoded ::
     247
     248            sage: from sage.coding.source_coding import Huffman
     249            sage: str = "Sage is my most favorite general purpose computer algebra system"
     250            sage: h = Huffman(str)
     251            sage: encoded = h.encode(str); encoded
     252            '00000110100010101011000011101010011100101010011011011100111101110010110100001011011111000001110101010001010110011010111111011001110100101000111110010011011100101011100000110001100101000101110101111101110110011000101011000111111101101111010010111001110100011'
     253            sage: h.decode(encoded)
     254            'Sage is my most favorite general purpose computer algebra system'
     255
     256        Of course, the string one tries to decode has to be a binary one. If
     257        not, an exception is raised ::
     258
     259            sage: h.decode('I clearly am not a binary string')
     260            Traceback (most recent call last):
     261            ...
     262            ValueError: The given string does not only contain 0 and 1
     263        """
     264        chars = []
     265        tree = self._tree
     266        index = self._index
     267        for i in string:
     268           
     269            if i == '0':
     270                tree = tree[0]
     271            elif i == '1':
     272                tree = tree[1]
     273            else:
     274                raise ValueError('The given string does not only contain 0 and 1')
     275
     276            if not isinstance(tree,list):
     277                chars.append(index[tree])
     278                tree = self._tree
     279
     280        return join(chars, '')
     281
     282    def encoding_table(self):
     283        r"""
     284        Returns the current encoding table
     285
     286        OUTPUT:
     287
     288        A dictionary associating its code to each trained letter of
     289        the alphabet
     290
     291        EXAMPLE::
     292
     293            sage: from sage.coding.source_coding import Huffman
     294            sage: str = "Sage is my most favorite general purpose computer algebra system"
     295            sage: h = Huffman(str)
     296            sage: h.encoding_table()
     297            {'S': '00000', 'a': '1101', ' ': '101', 'c': '110000', 'b': '110001', 'e': '010', 'g': '0001', 'f': '110010', 'i': '10000', 'm': '0011', 'l': '10011', 'o': '0110', 'n': '110011', 'p': '0010', 's': '1110', 'r': '1111', 'u': '10001', 't': '0111', 'v': '00001', 'y': '10010'}
     298        """
     299        return self._character_to_code.copy()
     300
     301    def tree(self):
     302        r"""
     303        Returns the Huffman tree corresponding to the current encoding
     304
     305        OUTPUT:
     306
     307        A tree
     308
     309        EXAMPLE::
     310
     311            sage: from sage.coding.source_coding import Huffman
     312            sage: str = "Sage is my most favorite general purpose computer algebra system"
     313            sage: h = Huffman(str)
     314            sage: T = h.tree(); T
     315            Digraph on 39 vertices
     316            sage: T.show(figsize=[20,20])
     317        """
     318
     319        from sage.graphs.digraph import DiGraph
     320        g = DiGraph()
     321        g.add_edges(self._generate_edges(self._tree))
     322        return g
     323
     324    def _generate_edges(self, tree, father='', id=''):
     325        if father=='':
     326            u = 'root'
     327        else:
     328            u = father
     329        try:
     330            return self._generate_edges(tree[0], father=father+id, id='0') + \
     331                self._generate_edges(tree[1], father=father+id, id='1') + \
     332                ([(u, father+id)] if (father+id) != '' else [])
     333
     334        except TypeError:
     335            return [(u, self.decode(father+id)+' : '+(father+id))]