Ticket #10440: trac_10440.patch

File trac_10440.patch, 6.7 KB (added by ddrake, 9 years ago)
  • sage/misc/interpreter.py

    # HG changeset patch
    # User Dan Drake <drake@kaist.edu>
    # Date 1291779894 -32400
    # Node ID f84dd049bc1322776229c982a8801a7e1c1e396e
    # Parent  1cb202e8a606dc2ab85489ff9df9cb4ea78c1aa1
    improve handling of encoding declarations (trac 10440)
    
    diff --git a/sage/misc/interpreter.py b/sage/misc/interpreter.py
    a b  
    100100
    101101import os
    102102import log
     103import re
    103104
    104105import remote_file
    105106
     
    242243    return 'from %s import *'%mod
    243244
    def handle_encoding_declaration(contents, out):
        r"""
        Find a PEP 263-style Python encoding declaration in the first or
        second line of ``contents``. If found, output it to ``out`` and
        return ``contents`` without the encoding line; otherwise output a
        default UTF-8 declaration and return ``contents`` unchanged.

        INPUT:

        - ``contents`` -- string; the source code to inspect.

        - ``out`` -- an object with a ``write`` method (for example
          ``sys.stdout``) that receives exactly one encoding declaration
          line, terminated by a newline.

        OUTPUT:

        The source code as a string, with the encoding declaration line
        removed if one was found. All other lines, including their line
        endings and any trailing newline, are left as they were.

        EXAMPLES::

            sage: from sage.misc.interpreter import handle_encoding_declaration
            sage: import sys
            sage: c1='# -*- coding: latin-1 -*-\nimport os, sys\n...'
            sage: c4='import os, sys\n...'
            sage: handle_encoding_declaration(c1, sys.stdout)
            # -*- coding: latin-1 -*-
            'import os, sys\n...'
            sage: handle_encoding_declaration(c4, sys.stdout)
            # -*- coding: utf-8 -*-
            'import os, sys\n...'

        TESTS:

        These are some of the tests listed in PEP 263::

            sage: contents = '#!/usr/bin/python\n# -*- coding: latin-1 -*-\nimport os, sys'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # -*- coding: latin-1 -*-
            '#!/usr/bin/python\nimport os, sys'

            sage: contents = '# This Python file uses the following encoding: utf-8\nimport os, sys'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # This Python file uses the following encoding: utf-8
            'import os, sys'

            sage: contents = '#!/usr/local/bin/python\n# coding: latin-1\nimport os, sys'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # coding: latin-1
            '#!/usr/local/bin/python\nimport os, sys'

        Two hash marks are okay; this shows up in SageTeX-generated scripts::

            sage: contents = '## -*- coding: utf-8 -*-\nimport os, sys\nprint x'
            sage: handle_encoding_declaration(contents, sys.stdout)
            ## -*- coding: utf-8 -*-
            'import os, sys\nprint x'

        When the encoding declaration doesn't match the specification, we
        write out a default UTF-8 declaration.

        Incorrect coding line::

            sage: contents = '#!/usr/local/bin/python\n# latin-1\nimport os, sys'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # -*- coding: utf-8 -*-
            '#!/usr/local/bin/python\n# latin-1\nimport os, sys'

        Encoding declaration not on first or second line::

            sage: contents ='#!/usr/local/bin/python\n#\n# -*- coding: latin-1 -*-\nimport os, sys'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # -*- coding: utf-8 -*-
            '#!/usr/local/bin/python\n#\n# -*- coding: latin-1 -*-\nimport os, sys'

        We don't check for legal encoding names; that's Python's job::

            sage: contents ='#!/usr/local/bin/python\n# -*- coding: utf-42 -*-\nimport os, sys'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # -*- coding: utf-42 -*-
            '#!/usr/local/bin/python\nimport os, sys'

        Only comment lines count as declarations; code that merely
        contains the text ``coding:`` is left alone::

            sage: contents = 'label = "coding: latin-1"\nimport os, sys\n'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # -*- coding: utf-8 -*-
            'label = "coding: latin-1"\nimport os, sys\n'

        A trailing newline survives removal of the declaration::

            sage: contents = '# coding: latin-1\nimport os, sys\n'
            sage: handle_encoding_declaration(contents, sys.stdout)
            # coding: latin-1
            'import os, sys\n'

        NOTES:

        - PEP 263: http://www.python.org/dev/peps/pep-0263/

        - PEP 263 says that Python will interpret a UTF-8 byte order mark
          as a declaration of UTF-8 encoding. This function only sees a
          Python string, so it does not account for a BOM.

        - We default to UTF-8 encoding even though PEP 263 says that
          Python files should default to ASCII.

        - Also see http://docs.python.org/ref/encodings.html.

        AUTHORS:

        - Lars Fischer
        - Dan Drake (2010-12-08, rewrite for ticket #10440)
        """
        # Keep the line endings so that the untouched lines can be glued back
        # together exactly as they were (trailing newline, "\r\n", ...).
        lines = contents.splitlines(True)
        for num, line in enumerate(lines[:2]):
            # PEP 263: the declaration must be a comment, i.e. the first
            # non-blank character of the line is '#'. re.match anchors the
            # pattern at the start of the line.
            if re.match(r"[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)", line):
                # Emit the declaration with exactly one trailing newline, even
                # when it was the last line of contents and had none.
                out.write(line.rstrip('\r\n') + '\n')
                return ''.join(lines[:num] + lines[num + 1:])

        # If we didn't find any encoding hints, use utf-8. This is not in
        # conformance with PEP 263, which says that Python files default to
        # ascii encoding.
        out.write("# -*- coding: utf-8 -*-\n")
        return contents
    322 
    323353   
    324354def preparse_file_named_to_stream(name, out):
    325355    r"""