Ticket #10637: tools_for_sws2rst.patch
File tools_for_sws2rst.patch, 16.9 KB (added 10 years ago)
new file sagenb/misc/comments2rst.py
# HG changeset patch
# User Pablo Angulo <pablo.angulo@uam.es>
# Date 1295452728 -3600
# Node ID be12df704d4e49b30b98883573f2c30298f091ba
# Parent  eaa34d73e6ff6d1fd06449e223c1f7cc0bde3cdc
[mq]: tools_for_sws2rst.patch

diff -r eaa34d73e6ff -r be12df704d4e sagenb/misc/comments2rst.py
# -*- coding: utf-8 -*-
import re
import os
from BeautifulSoup import (BeautifulStoneSoup, BeautifulSoup, ICantBelieveItsBeautifulSoup,
                           Tag, NavigableString, CData, Comment, Declaration, ProcessingInstruction)

def replace_courier(soup):
    for t in soup.findAll(lambda s:s.has_key('style') and 'courier' in s['style']):
        #TODO: what if contents has more than one
        c = t.contents[0]
        tag = Tag(soup, 'code')
        tag.insert(0,c)
        t.replaceWith(tag)

#inline_latex is careful not to confuse escaped dollars
#TODO: first dollar at the beginning of a line
inline_latex = re.compile('([^\\\\])\\$(.*?)([^\\\\])\\$')
##display_latex = re.compile('\$\$(.*?)\$\$')
def replace_latex(soup):
##    for t in soup.findAll(text = display_latex):
##        t.replaceWith(display_latex.sub(':math:``\\1``',
##                                        unicode(t)))
    for t in soup.findAll(text = inline_latex):
        t.replaceWith(inline_latex.sub('\\1:math:`\\2\\3`',
                                       unicode(t)))
p_tags = re.compile(r'(\</?p\>)')
def prune_tags(text):
    count = text.count('<p>') - text.count('</p>')
    return text.replace('<br/>','').replace('<p>','').replace('</p>',''), count

class Soup2Rst(object):
    """builds the rst text from the Soup Tree
    """
    tags = {'h1':'header',
            'h2':'header',
            'h3':'header',
            'h4':'header',
            'p': 'inline_no_tag',
            '[document]': 'document',
            'br': 'br',
            'b':'strong',
            'strong':'strong',
            'em':'em',
            'pre':'pre',
            'code':'code',
            'display':'display',
            'span':'inline_no_tag',
            'ul':'ul',
            'li':'li',
            'a':'a',
            'table':'table',
#            'tr':'tr',
            'td':'inline_no_tag',
            'th':'inline_no_tag',
            'tt':'inline_no_tag',
            'div':'block_no_tag',
            'img':'img',
#            '':'',
            }

    headers = {'h1':u'=',
               'h2':u':',
               'h3':u'~',
               'h4':u'-',
               }

    def __init__(self, images_dir):
        self._nested_list = 0
        self.images_dir = images_dir

    def visit(self, node):
        if isinstance(node, (CData, Comment, Declaration, ProcessingInstruction)):
            return ''
        elif hasattr(node, 'name'):
            try:
                visitor = getattr(self, 'visit_' + self.tags[node.name])
                return visitor(node)
            except (KeyError, AttributeError):
                print 'Warning: node not supported (or something else?) ' + node.name
                return unicode(node)
        else:
            #Assume plain string
            return unicode(node)

    def visit_document(self, node):
        return '\n'.join(self.visit(tag) for tag in node.contents)

    def get_plain_text(self, node):
        '''Gets all text, removing all tags'''
        if hasattr(node, 'contents'):
            return ' '.join(self.get_plain_text(tag) for tag in node.contents)
        else:
            return unicode(node)

    def visit_header(self, node):
        s = ' '.join(self.visit(tag) for tag in node.contents)
        spacer = self.headers[node.name]*len(s)
        return s.replace( '\n', '') + '\n' + spacer

##    def visit_paragraph(self, node):
##        return ' '.join(self.visit(tag) for tag in node.contents)

    def visit_pre(self, node):
        return '::\n\n\t'+unicode(node)[5:-6].replace('<br />','\n').replace('<br></br>','\n').replace('\n','\n\t')

    def visit_ul(self, node):
        self._nested_list += 1
        result = '\n'.join(self.visit(tag) for tag in node.contents)
        self._nested_list -= 1
        return result

    def visit_li(self, node):
        return ' '*self._nested_list+'- '+' '.join(self.visit(tag) for tag in node.contents)

    def visit_display(self, node):
        return '.. MATH::\n\t'+unicode(node)[9:-10].replace('<br></br>','\n').replace('\n','\n\t')

    def visit_img(self, node):
        return '.. image:: ' + os.path.join(self.images_dir, node['src']) + '\n\t:align: center\n'

    def visit_table(self,node):
##        print unicode(node)
##        print node.contents
##        print '_'
        rows = []
        for elt in node.contents:
##            print unicode(elt)
##            print elt.name if hasattr(elt,'name') else 'string'
            if not hasattr(elt,'name'):
                pass
            elif elt.name == 'thead':
                rows.extend(self.prepare_tr(row)
                            for row in elt
                            if hasattr(row,'name') and
                               row.name=='tr')
                rows.append([]) #this row represents a separator
            elif elt.name == 'tbody':
                rows.extend(self.prepare_tr(row)
                            for row in elt
                            if hasattr(row,'name') and
                               row.name=='tr')
            elif elt.name == 'tr':
                rows.append(self.prepare_tr(elt))

##        print rows
##        print '_'
        ncols = max(len(row) for row in rows)
        for row in rows:
            if len(row) < ncols:
                row.extend( ['']*(ncols - len(row)))
        cols_sizes = [max(len(td) for td in tds_in_col)
                      for tds_in_col in zip(*rows)]
        result = [' '.join('='*c for c in cols_sizes)]
##        if node.contents[0].name == 'th':
##            row = rows.pop(0)
##            result.append(' '.join(col+' '*(l - len(col)) for col in row))

        for row in rows:
            if any(td for td in row):
                result.append(' '.join(td+' '*(l - len(td))
                                       for l,td in zip(cols_sizes,row)))
            else:
                result.append(' '.join('-'*c for c in cols_sizes))
        result.append(' '.join('='*c for c in cols_sizes))
##        print '---'
        return '\n'.join(result)

    def prepare_tr(self, node):
        return [self.visit(tag) for tag in node.contents if tag!='\n']

    def visit_br(self, node):
##        return ''
        return '\n'

    #TODO: strong+code and code+strong
    def visit_strong(self, node):
        if node.contents:
            return '**' + ' '.join(self.visit(tag) for tag in node.contents).strip() + '**'
        else:
            return ''

    def visit_em(self,node):
        if node.contents:
            return '*' + ' '.join(self.visit(tag) for tag in node.contents).strip() + '*'
        else:
            return ''

    #TODO: strong+code and code+strong
    def visit_code(self, node):
        if node.contents:
            return '``' + self.get_plain_text(node) + '``'
        else:
            return ''

    def visit_inline_no_tag(self, node):
        return ' '.join(self.visit(tag) for tag in node.contents)

    def visit_block_no_tag(self, node):
        return '\n'.join(self.visit(tag) for tag in node.contents)

    def visit_a(self, node):
        return ('`' + ' '.join(self.visit(tag) for tag in node.contents) +
                ' <' + node['href'] + '>`_'
                )

def html2rst(text, images_dir):
    ls = []
    start_tag = True
    partes = text.split('$$')
    for c in partes[:-1]:
        if start_tag:
            ls.append(c)
            ls.append('<display>')
        else:
            c0, count = prune_tags(c)
            ls.append(c0)
            ls.append('</display>')
            if count == 1:
                ls.append('<p>')
            elif count == -1:
                ls.append('</p>')
            elif abs(count)>1:
                raise Exception, 'display latex was messed up with html code'
        start_tag = not start_tag
    ls.append(partes[-1])
    text = ''.join(ls)
##    print text

##    soup = BeautifulSoup(text,
##                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    soup = ICantBelieveItsBeautifulSoup(text,
                                        convertEntities=ICantBelieveItsBeautifulSoup.HTML_ENTITIES)

    replace_courier(soup)
    replace_latex(soup)
    v = Soup2Rst(images_dir)
    return v.visit(soup)
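For orientation, a minimal usage sketch of html2rst; it is not part of the patch, it assumes Python 2 with the BeautifulSoup 3 package imported above, and the sample HTML fragment and images directory are invented:

    from comments2rst import html2rst

    html = '<p>The function <span style="font-family: courier">f</span> satisfies $f(x)=x^2$.</p>'
    # replace_courier rewrites the courier span as ``f``, replace_latex turns
    # the $...$ formula into :math:`...`, and Soup2Rst walks the tree to emit reST.
    print html2rst(html, 'images/')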
new file sagenb/misc/results2rst.py
diff -r eaa34d73e6ff -r be12df704d4e sagenb/misc/results2rst.py
# -*- coding: utf-8 -*-
import re
IMAGES_DIR = 'images/'

class States(object):
    NORMAL = 0
    HTML = 1
    MATH = 2
    TRACEBACK = 3

class LineTypes(object):
    PLAIN = 0
    IMAGE = 1
    LATEX = 2
    HTML = 3
    TRACE = 4

#TODO: tidy up
def results2rst(text, images_dir):
    ##Order matters, place more restrictive regexes before more general ones
    ##If no regex matches, line will be discarded
    ##a self transition is needed to produce any output
    transitions = {
        States.NORMAL:[
            #IMAGE
            (re.compile(r"^\<html\>\<font color='black'\>"
                        r"\<img src='cell\://(.*?)'\>"
                        r"\</font\>\</html\>"),
             "\n.. image:: " + images_dir + "\\1\n\t:align: center\n",
             LineTypes.IMAGE,
             States.NORMAL),
            #SELF-CONTAINED MATH
            (re.compile(r"^\<html\>\<div class=\"math\"\>"
                        r"\\newcommand\{\\Bold\}\[1\]\{\\mathbf\{\#1\}\}"
                        r"(.*?)\</div\>\</html\>$"),
             "\n.. MATH::\n\n\t\\1",
             LineTypes.LATEX,
             States.NORMAL),
            #SELF-CONTAINED MATH - BIS
            (re.compile(r"^\<html\>\<div class=\"math\"\>"
                        r"(.*?)\</div\>\</html\>$"),
             "\n.. MATH::\n\n\t\\1",
             LineTypes.LATEX,
             States.NORMAL),
            #START Traceback
            (re.compile(r"^(Traceback.*)"),
             "\tTraceback (most recent call last):",
             LineTypes.TRACE,
             States.TRACEBACK),
            #START MATH
            (re.compile(r"^\<html\>\<div class=\"math\"\>"
                        r"\\newcommand\{\\Bold\}\[1\]\{\\mathbf\{\#1\}\}(.*?)"),
             "\n.. MATH::\n\n\t\\1",
             LineTypes.LATEX,
             States.MATH),
            #SELF-CONTAINED HTML
            (re.compile(r"^\<html\>.*</html\>$"),
             "\t<html>...</html>",
             LineTypes.HTML,
             States.NORMAL),
            #START HTML
            (re.compile(r"^\<html\>.*"),
             "\t<html>...</html>",
             LineTypes.HTML,
             States.HTML),
##            #START HTML
##            (re.compile(r"^(\<html\>.*)"),
##             "\n\\1",
##             LineTypes.HTML,
##             States.HTML),
##            #SELF-CONTAINED HTML
##            (re.compile(r"^(\<html\>.*</html\>)$"),
##             "\n\\1",
##             LineTypes.HTML,
##             States.NORMAL),
            #CONTINUE NORMAL
            (re.compile("(.*)"),
             "\t\\1",
             LineTypes.PLAIN,
             States.NORMAL),
            ],
        States.MATH:[
            #END MATH
            (re.compile(r"(.*?)\</div\>\</html\>$"),
             "\t\\1",
             LineTypes.LATEX,
             States.NORMAL),
            #CONTINUE MATH
            (re.compile("(.*)"),
             "\t\\1",
             LineTypes.LATEX,
             States.MATH),
            ],
        States.TRACEBACK:[
            #END Traceback
            (re.compile(r"^(\S.*)"),
             "\t...\n\t\\1",
             LineTypes.TRACE,
             States.NORMAL),
            ],
        States.HTML:[
##            #END HTML
##            (re.compile(r"(.*</html\>)$"),
##             "\\1",
##             LineTypes.HTML,
##             States.NORMAL),
##            #CONTINUE HTML
##            (re.compile(r"(.*)"),
##             "\\1",
##             LineTypes.HTML,
##             States.NORMAL),
            #END HTML
            (re.compile(r".*</html\>$"),
             "",
             LineTypes.HTML,
             States.NORMAL),
            ],
        }
    result_plain = []
    result_show = []
    state = States.NORMAL
    for line in text.splitlines():
        for regex, replacement, line_type, new_state in transitions[state]:
            if regex.match(line):
                result = result_plain if line_type in (LineTypes.PLAIN, LineTypes.HTML)\
                         else result_show
                result.append( regex.sub(replacement, line))
                state = new_state
                break
##        else:
##            result.append('\t' + line)
    result_plain.extend(result_show)
    return '\n'.join(result_plain)
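A minimal usage sketch of results2rst; it is not part of the patch, the cell output below is invented, and 'images/cell_1_' stands in for the per-cell image prefix that worksheet2rst passes:

    from results2rst import results2rst

    cell_output = "\n".join([
        "<html><font color='black'><img src='cell://sage0.png'></font></html>",
        "2/3",
    ])
    # The first line matches the IMAGE transition and becomes an ".. image::"
    # directive; the plain line is indented as literal output.
    print results2rst(cell_output, 'images/cell_1_')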
new file sagenb/misc/worksheet2rst.py
diff -r eaa34d73e6ff -r be12df704d4e sagenb/misc/worksheet2rst.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import re
from comments2rst import html2rst
from results2rst import results2rst
import codecs

#TODO: dont print empty code cells
#How about no code, but some output? That is weird, and output is dropped
#This can be done with a new state: RESULT_TO_BE_DROPPED, for example
class States(object):
    COMMENT = 0
    CODE = 1
    RESULT = 2

# REs for splitting comments, code and results
START_CELL_RE = re.compile('^\{\{\{id=(\d*)\|')
END_CODE_RE = re.compile('^\/\/\/')
END_CELL_RE = re.compile('^\}\}\}')

transitions = {
    States.COMMENT:(
        START_CELL_RE,
        States.CODE
        ),
    States.CODE:(
        END_CODE_RE,
        States.RESULT),
    States.RESULT:(
        END_CELL_RE,
        States.COMMENT)
    }

comment_parser = html2rst

#code_parser = lambda s:u'::\n\n\tsage: '+u'\n\t... '.join(s.splitlines())
def code_parser(s):
    """

    Arguments:
    - `s`: sage code, may or may not start with "sage:"
    """
    lines = ['::\n']
    for s in s.splitlines():
        l = s[6:] if s[:6]=='sage: ' else s
        if not l: continue
        prefix = '\t... ' if l[0] == ' ' else '\tsage: '
        lines.append(prefix + l)
    return '\n'.join(lines)

#result_parser = lambda s:u'\n'.join(u'\t'+l for l in s.splitlines())
result_parser = results2rst
parsers = [comment_parser, code_parser, result_parser]

def parse(s, images_dir=''):
    state = States.COMMENT
    result = []
    ls = []
    last = 0
    for line in s.splitlines():
        regex, next_state = transitions[state]
        m = regex.match(line)
        if m:
            if state == States.COMMENT:
                last_cell_id = m.group(1)
                #TODO: comments may contain images
                img_path = images_dir + os.path.sep
                result.append(parsers[state](u'\n'.join(ls), img_path))
            elif state == States.RESULT:
                img_path = os.path.join(images_dir, 'cell_%s_'%last_cell_id)
                result.append(parsers[state](u'\n'.join(ls),
                                             img_path))
            else:
                result.append(parsers[state](u'\n'.join(ls)))
            ls = []
            state = next_state
        else:
            ls.append(line)
##    result.append(parsers[state](u'\n'.join(ls) ))
    if state == States.COMMENT:
        img_path = images_dir + os.path.sep
        result.append(parsers[state](u'\n'.join(ls), img_path))
    elif state == States.RESULT:
        img_path = os.path.join(images_dir, 'cell_%s_'%last_cell_id)
        result.append(parsers[state](u'\n'.join(ls),
                                     img_path))
    else:
        result.append(parsers[state](u'\n'.join(ls)))

    return u'\n'.join(result)

if __name__=='__main__':
    if len(sys.argv)>1:
        fichero = codecs.open(sys.argv[1], mode='r', encoding='utf-8')
        text = fichero.read()
        fichero.close()
    else:
        text = sys.stdin.read()

    print parse(text).encode('utf-8')
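A minimal usage sketch of the parse entry point; it is not part of the patch, the input file name and images directory are invented, and the worksheet text is assumed to be in the {{{id=N| ... /// ... }}} cell format parsed above:

    import codecs
    from worksheet2rst import parse

    text = codecs.open('worksheet.txt', mode='r', encoding='utf-8').read()
    # Comments go through html2rst, code cells through code_parser, and
    # cell results through results2rst; image paths are prefixed with images/.
    print parse(text, images_dir='images').encode('utf-8')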