# HG changeset patch
# User Pablo Angulo Ardoy (pang) <pablo.angulo@uam.es>
# Date 1349798120 -7200
# Node ID ef9db668ad1d9c5a56e5f201fbcef01c4abb94bc
# Parent d97775b5afedd24b48054aaf8750206ea041a2bf
[mq]: sws2rst_answer_to_kcrisman
diff --git a/sagenb/misc/comments2rst.py b/sagenb/misc/comments2rst.py
--- a/sagenb/misc/comments2rst.py
+++ b/sagenb/misc/comments2rst.py
33 | 33 | in the Sage shell (sage --sh). |
34 | 34 | """ |
35 | 35 | |
| 36 | #negative lookbehind: http://www.regular-expressions.info/lookaround.html |
| 37 | double_dollar = re.compile(r'(?<!\\)\$\$') |
36 | 38 | def preprocess_display_latex(text): |
37 | 39 | r"""replace $$some display latex$$ with <display>some display latex</display> |
38 | 40 | before the soup is built. |
… |
… |
|
58 | 60 | """ |
59 | 61 | ls = [] |
60 | 62 | start_tag = True |
61 | | partes = text.split('$$') |
62 | | for c in partes[:-1]: |
| 63 | parts = double_dollar.split(text) |
| 64 | for c in parts[:-1]: |
63 | 65 | if start_tag: |
64 | 66 | ls.append(c) |
65 | 67 | ls.append('<display>') |
… |
… |
|
74 | 76 | elif abs(count)>1: |
75 | 77 | raise Exception, 'display latex was messed up with html code' |
76 | 78 | start_tag = not start_tag |
77 | | ls.append(partes[-1]) |
| 79 | ls.append(parts[-1]) |
78 | 80 | return ''.join(ls) |
79 | 81 | |
80 | 82 | def prune_tags(text): |
… |
… |
|
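The hunks above switch `preprocess_display_latex` from a plain `text.split('$$')` to splitting on a negative-lookbehind pattern, so an escaped `\$$` no longer opens a display-math block. A minimal standalone sketch of that behaviour (sample text invented for illustration, not the patched function itself):

    import re

    double_dollar = re.compile(r'(?<!\\)\$\$')   # unescaped $$ only

    sample = r'escaped \$$ stays put, but $$x^2$$ is display latex'
    print(double_dollar.split(sample))
    # ['escaped \\$$ stays put, but ', 'x^2', ' is display latex']

Pieces at even indices are ordinary text and pieces at odd indices are the display latex that gets wrapped in `<display>` tags; a plain `split('$$')` would also have split inside the escaped `\$$`.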
90 | 92 | text = text.replace(c,r) |
91 | 93 | return text |
92 | 94 | |
| 95 | #This is supposed to be handled by BeautifulSoup, but doesn't work |
| 96 | xml_entities = {'&lt;':'<', |
| 97 | '&gt;':'>', |
| 98 | '&amp;':'&', |
| 99 | '&quot;':'"', |
| 100 | '&#39;':"'", |
| 101 | } |
| 102 | def replace_xml_entities(text): |
| 103 | for c,r in xml_entities.iteritems(): |
| 104 | text = text.replace(c,r) |
| 105 | return text |
| 106 | |
| 107 | |
93 | 108 | def replace_courier(soup): |
94 | 109 | """Lacking a better option, I use courier font to mark <code> |
95 | 110 | within tinyMCE. And I want to turn that into real code tags. |
96 | 111 | |
97 | | Most users won't be needing this(?) |
| 112 | Most users won't be needing this(?), so this code is not called anywhere, |
| 113 | but is kept for reference. |
98 | 114 | """ |
99 | 115 | for t in soup.findAll(lambda s:s.has_key('style') and 'courier' in s['style']): |
100 | 116 | tag = Tag(soup, 'code') |
… |
… |
|
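`replace_xml_entities` above is a plain textual pass over the finished rst, for the few entities that BeautifulSoup leaves behind. A hedged sketch of the same idea, assuming the entity table shown in the hunk:

    XML_ENTITIES = {'&lt;': '<', '&gt;': '>', '&amp;': '&',
                    '&quot;': '"', '&#39;': "'"}

    def replace_xml_entities(text):
        # straightforward search and replace over the generated rst text
        for entity, char in XML_ENTITIES.items():
            text = text.replace(entity, char)
        return text

    print(replace_xml_entities('2 &lt; 3 &amp;&amp; x &gt; 0'))
    # 2 < 3 && x > 0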
102 | 118 | tag.append(t.contents[0]) |
103 | 119 | t.replaceWith(tag) |
104 | 120 | |
105 | | #inline_latex is careful not to confuse escaped dollars |
106 | | inline_latex = re.compile(r'([^\\])\$(.*?)([^\\])\$') |
107 | | latex_beginning = re.compile(r'\$(.*?)([^\\])\$') |
| 121 | #negative lookbehind: http://www.regular-expressions.info/lookaround.html |
| 122 | single_dollar = re.compile(r'(?<!\\)\$') |
108 | 123 | def replace_latex(soup): |
109 | 124 | r"""Replaces inline latex by :math:`code` and escapes |
110 | 125 | some rst special chars like +, -, * and | outside of inline latex |
… |
… |
|
125 | 140 | <p><strong>2\+2 \| 1\+3</strong></p> |
126 | 141 | """ |
127 | 142 | for t in soup.findAll(text=re.compile('.+')): |
128 | | if latex_beginning.match(t): |
129 | | t.replaceWith(inline_latex.sub('\\1:math:`\\2\\3`', |
130 | | latex_beginning.sub(':math:`\\1\\2`', |
131 | | unicode(t), |
132 | | 1))) |
133 | | elif inline_latex.search(t): |
134 | | t.replaceWith(inline_latex.sub('\\1:math:`\\2\\3`', |
135 | | unicode(t))) |
136 | | elif not (t.fetchParents(name = 'display') |
137 | | or t.fetchParents(name = 'pre')): |
138 | | t.replaceWith(escape_chars(t)) |
| 143 | if (t.fetchParents(name = 'display') or |
| 144 | t.fetchParents(name = 'pre') ): |
| 145 | continue |
| 146 | parts = single_dollar.split(unicode(t)) |
| 147 | even = [escape_chars(parts[i]) for i in range(0,len(parts),2)] |
| 148 | odd = [' :math:`%s`'%parts[i] for i in range(1,len(parts),2)] |
| 149 | odd.append('') |
| 150 | t.replaceWith(''.join(''.join(p) for p in zip(even,odd) )) |
139 | 151 | |
140 | 152 | class Soup2Rst(object): |
141 | 153 | """builds the rst text from the Soup Tree |
… |
… |
|
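The rewritten body of `replace_latex` above splits each text node on unescaped single dollars and then interleaves the pieces: even indices are prose (rst-escaped), odd indices become `:math:` roles. A standalone sketch of that step, with a simplified `escape_chars` stand-in (the real one lives elsewhere in the module):

    import re

    single_dollar = re.compile(r'(?<!\\)\$')     # unescaped $ only

    def escape_chars(s):
        # simplified stand-in: escape a few rst special characters
        for c in '+*|-':
            s = s.replace(c, '\\' + c)
        return s

    def inline_math_to_rst(text):
        parts = single_dollar.split(text)
        even = [escape_chars(parts[i]) for i in range(0, len(parts), 2)]
        odd = [' :math:`%s`' % parts[i] for i in range(1, len(parts), 2)]
        odd.append('')                           # pad so zip() keeps the tail
        return ''.join(''.join(p) for p in zip(even, odd))

    print(inline_math_to_rst('area is $\\pi r^2$ for radius r'))
    # area is  :math:`\pi r^2` for radius r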
144 | 156 | 'h2':'header', |
145 | 157 | 'h3':'header', |
146 | 158 | 'h4':'header', |
147 | | 'p': 'inline_no_tag', |
| 159 | 'p': 'p', |
148 | 160 | '[document]': 'document', |
149 | 161 | 'br': 'br', |
150 | 162 | 'b':'strong', |
… |
… |
|
170 | 182 | |
171 | 183 | headers = {'h1':u'=', |
172 | 184 | 'h2':u'-', |
173 | | 'h3':u'~', |
| 185 | 'h3':u'^', |
174 | 186 | 'h4':u'"', |
| 187 | 'h5':u'~', |
175 | 188 | } |
176 | 189 | |
177 | 190 | def __init__(self, images_dir): |
178 | 191 | self.images_dir = images_dir |
179 | | self._nested_list = 0 |
180 | | self._inside_ol = False |
| 192 | self._nested_list = -1 |
| 193 | self._inside_ol_or_ul = [] |
181 | 194 | self._inside_code_tag = False |
182 | 195 | |
183 | 196 | def visit(self, node): |
… |
… |
|
206 | 219 | return t.replace('\n','') |
207 | 220 | |
208 | 221 | def visit_header(self, node): |
209 | | s = ' '.join(self.visit(tag) for tag in node.contents) |
| 222 | s = ''.join(self.visit(tag) for tag in node.contents) |
210 | 223 | spacer = self.headers[node.name]*len(s) |
211 | 224 | return s.replace( '\n', '') + '\n' + spacer |
212 | 225 | |
… |
… |
|
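For the `headers` mapping and `visit_header` above: an rst section title needs an underline at least as long as the title text, and joining the children with `''` instead of `' '` keeps that length calculation exact. A minimal sketch (names local to this example):

    HEADERS = {'h1': u'=', 'h2': u'-', 'h3': u'^', 'h4': u'"', 'h5': u'~'}

    def underline(title, level='h1'):
        # rst title followed by an underline of exactly the same length
        title = title.replace('\n', '')
        return title + '\n' + HEADERS[level] * len(title)

    print(underline('Converted worksheet', 'h2'))
    # Converted worksheet
    # -------------------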
215 | 228 | |
216 | 229 | def visit_ul(self, node): |
217 | 230 | self._nested_list += 1 |
218 | | result = '\n'.join(self.visit(tag) for tag in node.contents) |
| 231 | self._inside_ol_or_ul.append(False) |
| 232 | result = '\n\n'+''.join(self.visit(tag) for tag in node.contents)+'\n' |
| 233 | self._inside_ol_or_ul.pop() |
219 | 234 | self._nested_list -= 1 |
220 | 235 | return result |
221 | 236 | |
222 | 237 | def visit_ol(self, node): |
223 | 238 | self._nested_list += 1 |
224 | | self._inside_ol = True |
225 | | result = '\n'.join(self.visit(tag) for tag in node.contents) |
| 239 | self._inside_ol_or_ul.append(True) |
| 240 | result = '\n\n'+''.join(self.visit(tag) for tag in node.contents)+'\n' |
| 241 | self._inside_ol_or_ul.pop() |
226 | 242 | self._nested_list -= 1 |
227 | | self._inside_ol = False |
228 | 243 | return result |
229 | 244 | |
230 | 245 | def visit_li(self, node): |
231 | 246 | return (' '*self._nested_list |
232 | | + ('#. ' if self._inside_ol else '- ') |
233 | | +' '.join(self.visit(tag) for tag in node.contents)) |
| 247 | + ('#. ' if self._inside_ol_or_ul[-1] else '- ') |
| 248 | +' '.join(self.visit(tag) for tag in node.contents) |
| 249 | + '\n') |
234 | 250 | |
235 | 251 | def visit_display(self, node): |
236 | | return ('\n.. MATH::\n\n ' + |
| 252 | return ('\n\n.. MATH::\n\n ' + |
237 | 253 | unicode(node)[9:-10].replace('<br></br>','\n').replace('\n','\n ') + |
238 | | '\n\n') |
| 254 | '\n\n.. end of math\n\n') |
239 | 255 | |
240 | 256 | def visit_img(self, node): |
241 | 257 | return '.. image:: ' + os.path.join(self.images_dir, node['src'].replace(' ','_')) + '\n :align: center\n' |
… |
… |
|
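The `visit_ul`/`visit_ol`/`visit_li` changes above replace the single `_inside_ol` flag with a stack, so a bulleted list nested inside a numbered one picks the right marker again, while `_nested_list` starting at -1 leaves the outermost level unindented. A self-contained sketch of that bookkeeping over a toy tree (the tuple-based node format is invented for the example):

    def list_to_rst(items, ordered_stack=None, depth=-1):
        # strings are <li> texts; ('ol'|'ul', children) are nested lists
        if ordered_stack is None:
            ordered_stack = []
        lines = []
        for item in items:
            if isinstance(item, tuple):              # entering <ol> or <ul>
                kind, children = item
                ordered_stack.append(kind == 'ol')
                lines.append(list_to_rst(children, ordered_stack, depth + 1))
                ordered_stack.pop()
            else:                                    # a list item
                marker = '#. ' if ordered_stack[-1] else '- '
                lines.append(' ' * depth + marker + item)
        return '\n'.join(lines)

    toy = [('ol', ['first', 'second', ('ul', ['nested bullet']), 'third'])]
    print(list_to_rst(toy))
    # #. first
    # #. second
    #  - nested bullet
    # #. third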
251 | 267 | if hasattr(row,'name') and |
252 | 268 | row.name=='tr') |
253 | 269 | rows.append([]) #this row represents a separator |
254 | | elif elt.name == 'tbody': |
| 270 | elif (elt.name == 'tbody') or (elt.name == 'tfoot'): |
255 | 271 | rows.extend(self.prepare_tr(row) |
256 | 272 | for row in elt |
257 | 273 | if hasattr(row,'name') and |
… |
… |
|
285 | 301 | def visit_strong(self, node): |
286 | 302 | if node.contents: |
287 | 303 | content = ' '.join(self.visit(tag) for tag in node.contents).strip() |
288 | | if '``' in content or self._inside_code_tag: |
| 304 | if '``' in content: |
289 | 305 | return content |
290 | 306 | else: |
291 | 307 | return '**' + content + '**' |
… |
… |
|
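On the simplified guard in `visit_strong` above: rst does not allow nested inline markup, so wrapping content that already contains an inline literal in `**` would produce invalid markup. A tiny sketch of the guard:

    def strongify(content):
        # **...** around ``...`` is not valid rst, so pass such content through
        if '``' in content:
            return content
        return '**' + content + '**'

    print(strongify('plain text'))       # **plain text**
    print(strongify('``some code``'))    # ``some code``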
294 | 310 | |
295 | 311 | def visit_em(self,node): |
296 | 312 | if node.contents: |
297 | | return '*' + ' '.join(self.visit(tag) for tag in node.contents).strip() + '*' |
| 313 | return ' *' + ' '.join(self.visit(tag) for tag in node.contents).strip() + '* ' |
298 | 314 | else: |
299 | 315 | return '' |
300 | 316 | |
301 | 317 | def visit_code(self, node): |
302 | 318 | if node.contents: |
303 | | self._inside_code_tag = True |
304 | 319 | content = self.get_plain_text(node).strip() |
305 | | self._inside_code_tag = False |
306 | 320 | return '``' + content + '``' |
307 | 321 | else: |
308 | 322 | return '' |
309 | 323 | |
310 | 324 | def visit_inline_no_tag(self, node): |
311 | 325 | return (' '.join(self.visit(tag) |
312 | | for tag in node.contents)).strip() + '\n' |
| 326 | for tag in node.contents)).strip() |
313 | 327 | |
314 | 328 | def visit_block_no_tag(self, node): |
315 | | return '\n'.join(self.visit(tag) for tag in node.contents) |
| 329 | return '\n'.join(self.visit(tag) for tag in node.contents) + '\n' |
| 330 | |
| 331 | def visit_p(self, node): |
| 332 | return ''.join(self.visit(tag) for tag in node.contents) + '\n\n' |
316 | 333 | |
317 | 334 | def visit_a(self, node): |
318 | | return ('`' + ' '.join(self.visit(tag) for tag in node.contents) + |
319 | | ' <' + node['href'] + '>`_' |
320 | | ) |
| 335 | c = ' '.join(self.visit(tag) for tag in node.contents) |
| 336 | try: |
| 337 | link = node['href'] |
| 338 | if link[0]=='#': |
| 339 | return ':ref:`%s <%s>`'%(c, link[1:]) |
| 340 | else: |
| 341 | return '`%s <%s>`_'%(c, link) |
| 342 | except KeyError: |
| 343 | return '.. _%s:\n\n'%node['name'] |
| 344 | |
321 | 345 | |
322 | 346 | def html2rst(text, images_dir): |
323 | 347 | """Converts html, tipically generated by tinyMCE, into rst |
… |
… |
|
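The new `visit_a` above handles three cases: `href="#label"` becomes an internal `:ref:`, any other href becomes an external link, and an `<a name=...>` anchor without an href becomes a `.. _label:` target. A standalone sketch under those assumptions (attributes passed as a plain dict instead of a BeautifulSoup tag):

    def anchor_to_rst(text, attrs):
        href = attrs.get('href')
        if href is None:
            return '.. _%s:\n\n' % attrs['name']        # <a name="label">
        if href.startswith('#'):
            return ':ref:`%s <%s>`' % (text, href[1:])  # internal reference
        return '`%s <%s>`_' % (text, href)              # external link

    print(anchor_to_rst('Sage', {'href': 'http://sagemath.org'}))
    # `Sage <http://sagemath.org>`_
    print(anchor_to_rst('see below', {'href': '#details'}))
    # :ref:`see below <details>`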
364 | 388 | #ICantBelieveItsBeautifulSoup is better than BeautifulSoup |
365 | 389 | #for html that wasn't generated by humans (like tinyMCE) |
366 | 390 | soup = ICantBelieveItsBeautifulSoup(text, |
367 | | convertEntities=ICantBelieveItsBeautifulSoup.HTML_ENTITIES) |
| 391 | convertEntities=ICantBelieveItsBeautifulSoup.ALL_ENTITIES) |
368 | 392 | |
369 | 393 | #remove all comments |
370 | 394 | comments = soup.findAll(text=lambda text:isinstance(text, Comment)) |
371 | 395 | for comment in comments: |
372 | 396 | comment.extract() |
373 | 397 | |
374 | | replace_courier(soup) |
| 398 | # replace_courier(soup) |
375 | 399 | replace_latex(soup) |
376 | 400 | v = Soup2Rst(images_dir) |
377 | | return v.visit(soup) |
| 401 | |
| 402 | # return v.visit(soup) |
| 403 | text = v.visit(soup) |
| 404 | more_than_2_blank_lines = re.compile(r'\n\n+', re.MULTILINE) |
| 405 | text = more_than_2_blank_lines.sub('\n\n', text) |
| 406 | text = replace_xml_entities(text) |
| 407 | return text |
| 408 | |
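The tail of `html2rst` now post-processes the generated rst: runs of blank lines are collapsed to a single blank line before the leftover entities are replaced. A minimal sketch of the collapsing step on its own (sample string invented):

    import re

    more_than_2_blank_lines = re.compile(r'\n\n+', re.MULTILINE)

    rst = 'Title\n=====\n\n\n\n\nFirst paragraph.\n\n\nSecond paragraph.\n'
    print(more_than_2_blank_lines.sub('\n\n', rst))
    # Title
    # =====
    #
    # First paragraph.
    #
    # Second paragraph.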
diff --git a/sagenb/misc/worksheet2rst.py b/sagenb/misc/worksheet2rst.py
--- a/sagenb/misc/worksheet2rst.py
+++ b/sagenb/misc/worksheet2rst.py
116 | 116 | lines.append(prefix + l) |
117 | 117 | return '\n'.join(lines) |
118 | 118 | |
| 119 | HEADER_RE = re.compile(r'<h\d>') |
| 120 | def add_title_if_there_is_none(text): |
| 121 | if not HEADER_RE.search(text): |
| 122 | return '<h1>Please write a title for this worksheet!</h1>\n' + text |
| 123 | else: |
| 124 | return text |
| 125 | |
119 | 126 | def worksheet2rst(s, images_dir=''): |
120 | 127 | """Parses a string, tipically the content of the file |
121 | 128 | worksheet.html inside a sws file, and converts it into |
… |
… |
|
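`add_title_if_there_is_none` above only looks for an existing `<h1>`..`<h9>` tag anywhere in the worksheet html; when the search fails, a placeholder `<h1>` is prepended so the resulting rst document always has a top-level title. A quick sketch of the check itself:

    import re

    HEADER_RE = re.compile(r'<h\d>')     # matches <h1> through <h9>

    print(bool(HEADER_RE.search('<h1>My worksheet</h1>')))   # True
    print(bool(HEADER_RE.search('<p>no title at all</p>')))  # False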
142 | 149 | sage: worksheet2rst(s) |
143 | 150 | u'.. -*- coding: utf-8 -*-\n\n\n::\n\n sage: show(f)\n\n.. MATH::\n\n \\sqrt{x}\n\n.. end of output\n' |
144 | 151 | """ |
145 | | result_parser = results2rst |
| 152 | s = add_title_if_there_is_none(s) |
146 | 153 | state = States.COMMENT |
147 | 154 | result = ['.. -*- coding: utf-8 -*-\n'] |
148 | 155 | ls = [] |
… |
… |
|
157 | 164 | result.append(html2rst(u'\n'.join(ls), img_path)) |
158 | 165 | elif state == States.RESULT: |
159 | 166 | img_path = os.path.join(images_dir, 'cell_%s_'%last_cell_id) |
160 | | result.append(result_parser(u'\n'.join(ls), |
| 167 | result.append(results2rst(u'\n'.join(ls), |
161 | 168 | img_path)) |
162 | 169 | result.append('') |
163 | 170 | result.append('.. end of output') |
… |
… |
|
191 | 198 | fichero.close() |
192 | 199 | else: |
193 | 200 | text = sys.stdin.read() |
| 201 | images_dir = sys.argv[2] if len(sys.argv)>2 else '' |
194 | 202 | |
195 | | print worksheet2rst(text).encode('utf-8') |
| 203 | print worksheet2rst(text, images_dir).encode('utf-8') |
196 | 204 | |
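With the last hunk, the images directory can be passed as a second command line argument after the input file (`images_dir` still defaults to the empty string). Calling the converter from Python looks roughly like the following hedged sketch; the file names are invented and it assumes sagenb is importable:

    import codecs
    from sagenb.misc.worksheet2rst import worksheet2rst

    html = codecs.open('worksheet.html', encoding='utf-8').read()  # extracted from the sws
    rst = worksheet2rst(html, images_dir='images')                 # new optional argument
    codecs.open('worksheet.rst', 'w', encoding='utf-8').write(rst)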