| 1 | # -*- coding: utf-8 -*- |
| 2 | r""" |
| 3 | Convert html from text cells in the notebook into ReStructuredText |
| 4 | |
| 5 | This is called by sws2rst |
| 6 | |
| 7 | - Pablo Angulo Ardoy (2011-02-25): initial version |
| 8 | """ |
| 9 | #************************************************** |
| 10 | # Copyright (C) 2011 Pablo Angulo |
| 11 | # |
| 12 | # Distributed under the terms of the GPL License |
| 13 | #************************************************** |
| 14 | |
| 15 | |
| 16 | import re |
| 17 | import os |
| 18 | try: |
| 19 | from BeautifulSoup import (ICantBelieveItsBeautifulSoup, Tag, |
| 20 | CData, Comment, Declaration, ProcessingInstruction) |
| 21 | except ImportError: |
| 22 | raise ImportError, """BeautifulSoup must be installed. |
| 23 | |
| 24 | You might download a spkg from: |
| 25 | |
| 26 | http://trac.sagemath.org/sage_trac/raw-attachment/ticket/10637/beautifulsoup-3.2.0.p0.spkg |
| 27 | """ |
| 28 | |
def preprocess_display_latex(text):
    r"""Replace ``$$some display latex$$`` with
    ``<display>some display latex</display>`` before the soup is built.

    Deals with the situation where <p></p> tags are mixed
    with $$, like $$<p>display_latex$$</p>, unless the mess is huge.

    INPUT:

    - ``text`` -- string -- a chunk of HTML text

    OUTPUT:

    - string -- the same text with every ``$$...$$`` pair turned into a
      ``<display>`` element.  ``<p>``, ``</p>``, ``<br/>`` and ``<br />``
      found between the two ``$$`` are removed; a single unbalanced
      ``<p>`` or ``</p>`` is re-emitted right after ``</display>``.

    Raises ``Exception`` when the text between a pair of ``$$`` is
    unbalanced by more than one ``<p>``/``</p>``.

    An unmatched ``$$`` (odd number of occurrences) produces an opening
    ``<display>`` with no closing tag.

    EXAMPLES::

        sage: from sagenb.misc.comments2rst import preprocess_display_latex
        sage: s="$$a=2$$"
        sage: preprocess_display_latex(s)
        '<display>a=2</display>'
        sage: s="<p>$$a=2$$</p>"
        sage: preprocess_display_latex(s)
        '<p><display>a=2</display></p>'
        sage: s="<p>$$a=2</p>$$"
        sage: preprocess_display_latex(s)
        '<p><display>a=2</display></p>'
        sage: s="$$<p>a=2</p>$$"
        sage: preprocess_display_latex(s)
        '<display>a=2</display>'
    """
    pieces = []
    # Chunks alternate: text before an opening $$, then latex before the
    # closing $$, then plain text again, and so on.
    opening = True
    parts = text.split('$$')
    for chunk in parts[:-1]:
        if opening:
            pieces.append(chunk)
            pieces.append('<display>')
        else:
            latex, balance = prune_tags(chunk)
            pieces.append(latex)
            pieces.append('</display>')
            # Put back the paragraph tag that was stripped from inside the
            # latex so the surrounding html stays balanced.
            if balance == 1:
                pieces.append('<p>')
            elif balance == -1:
                pieces.append('</p>')
            elif abs(balance) > 1:
                raise Exception('display latex was messed up with html code')
        opening = not opening
    pieces.append(parts[-1])
    return ''.join(pieces)

def prune_tags(text):
    """Strip paragraph and line-break tags from ``text``.

    Returns a pair ``(stripped_text, count)`` where ``count`` is the number
    of ``<p>`` minus the number of ``</p>`` found in the original ``text``.
    Only the exact spellings ``<p>``, ``</p>``, ``<br/>`` and ``<br />``
    are recognised.
    """
    count = text.count('<p>') - text.count('</p>')
    for tag in ('<br/>', '<br />', '<p>', '</p>'):
        text = text.replace(tag, '')
    return text, count
| 76 | |
# Characters that have a special meaning in rst, mapped to their
# backslash-escaped form.
escapable_chars = { '+' :r'\+',
                    '*' :r'\*',
                    '|' :r'\|',
                    '-' :r'\-'}
def escape_chars(text):
    """Return ``text`` with every key of ``escapable_chars`` replaced by
    its escaped form.

    The replacements never produce one of the escapable characters on
    their own (only a backslash is added), so the order in which the
    mapping is traversed does not affect the result.
    """
    # .items() works on both Python 2 and Python 3, unlike .iteritems().
    for char, escaped in escapable_chars.items():
        text = text.replace(char, escaped)
    return text
| 85 | |
def replace_courier(soup):
    """Turn every element styled with a courier font into a <code> tag.

    tinyMCE offers no direct way to mark <code>, so courier font is used
    as a stand-in; this converts that convention into real code tags.
    The soup is modified in place and nothing is returned.

    Most users won't be needing this(?)
    """
    def uses_courier(element):
        # Only elements carrying a style attribute that mentions courier.
        return element.has_key('style') and 'courier' in element['style']

    for styled in soup.findAll(uses_courier):
        code_tag = Tag(soup, 'code')
        # Move the children one by one; the loop relies on append()
        # detaching the child from ``styled`` so that its contents shrink.
        while styled.contents:
            first_child = styled.contents[0]
            code_tag.append(first_child)
        styled.replaceWith(code_tag)
| 97 | |
#inline_latex is careful not to confuse escaped dollars
inline_latex = re.compile(r'([^\\])\$(.*?)([^\\])\$')
latex_beginning = re.compile(r'\$(.*?)([^\\])\$')
def replace_latex(soup):
    r"""Rewrite inline latex as :math:`code` and escape some rst
    special chars (+, -, * and |) in the text nodes of ``soup``.

    Each non-empty text node is handled by the first rule that applies:

    - the node starts with ``$...$``: that leading formula is rewritten,
      then every remaining ``$...$`` in the node is rewritten too;
    - the node contains ``$...$`` elsewhere: every occurrence is rewritten;
    - otherwise, unless the node lives inside a display or pre tag, its
      special chars are escaped.

    A node that contains inline latex is therefore not escaped.  The soup
    is modified in place and nothing is returned.

    EXAMPLES::

        sage: from sagenb.misc.comments2rst import replace_latex
        sage: from BeautifulSoup import ICantBelieveItsBeautifulSoup
        sage: s = ICantBelieveItsBeautifulSoup("<p>Some <strong>latex: $e^\pi i=-1$</strong></p>")
        sage: replace_latex(s)
        sage: s
        <p>Some <strong>latex: :math:`e^\pi i=-1`</strong></p>
        sage: s = ICantBelieveItsBeautifulSoup("<p><strong>2+2 | 1+3</strong></p>")
        sage: replace_latex(s)
        sage: s
        <p><strong>2\+2 \| 1\+3</strong></p>
    """
    # Replacement templates: the inline pattern keeps the character that
    # precedes the opening dollar (group 1) in front of the role.
    math_inside = '\\1:math:`\\2\\3`'
    math_at_start = ':math:`\\1\\2`'

    for node in soup.findAll(text=re.compile('.+')):
        if latex_beginning.match(node):
            # The inline pattern needs a character before the dollar, so a
            # formula at the very start is converted separately, once.
            head_converted = latex_beginning.sub(math_at_start,
                                                 unicode(node),
                                                 1)
            node.replaceWith(inline_latex.sub(math_inside, head_converted))
            continue

        if inline_latex.search(node):
            node.replaceWith(inline_latex.sub(math_inside, unicode(node)))
            continue

        # Plain text: leave display and pre contents untouched.
        if node.fetchParents(name = 'display'):
            continue
        if node.fetchParents(name = 'pre'):
            continue
        node.replaceWith(escape_chars(node))
| 132 | |
class Soup2Rst(object):
    """Builds the rst text from the Soup Tree.

    ``visit(node)`` dispatches on the node's tag name through ``tags`` to
    one of the ``visit_*`` methods, each of which returns the rst text for
    that node (usually by visiting its children and joining the results).
    """
    # Tag name -> suffix of the ``visit_*`` method that renders it.
    # Rows ('tr') have no entry: visit_table handles them via prepare_tr.
    tags = {'h1':'header',
            'h2':'header',
            'h3':'header',
            'h4':'header',
            'p': 'inline_no_tag',
            '[document]': 'document',
            'br': 'br',
            'b':'strong',
            'strong':'strong',
            'em':'em',
            'pre':'pre',
            'code':'code',
            'display':'display',
            'span':'inline_no_tag',
            'ul':'ul',
            'ol':'ol',
            'li':'li',
            'a':'a',
            'table':'table',
            'td':'inline_no_tag',
            'th':'inline_no_tag',
            'tt':'inline_no_tag',
            'div':'block_no_tag',
            'img':'img',
           }

    # Header tag -> character used to underline the title.
    headers = {'h1':u'=',
               'h2':u'-',
               'h3':u'~',
               'h4':u'"',
              }

    def __init__(self, images_dir):
        """
        INPUT:

        - ``images_dir`` -- string -- folder prepended to the ``src`` of
          every image
        """
        self.images_dir = images_dir
        # Depth of the list currently being rendered (0 outside lists).
        self._nested_list = 0
        # True while the innermost enclosing list is an <ol>.
        self._inside_ol = False
        self._inside_code_tag = False

    def visit(self, node):
        """Return the rst text for ``node``.

        CData, comments, declarations and processing instructions render
        as the empty string.  Tags are rendered by the visitor registered
        in ``tags``; if the tag is not registered, or the visitor raises
        KeyError or AttributeError, a warning is printed and the node's
        own serialization is returned.  Anything without a ``name``
        attribute is treated as a plain string, with newlines removed.
        """
        if isinstance(node, (CData, Comment, Declaration, ProcessingInstruction)):
            return ''
        elif hasattr(node, 'name'):
            try:
                visitor = getattr(self, 'visit_' + self.tags[node.name])
                return visitor(node)
            except (KeyError, AttributeError):
                # Single-argument call form: prints the same text on
                # Python 2 (parenthesised expression) and Python 3.
                print('Warning: node not supported (or something else?) ' + node.name)
                return unicode(node)
        else:
            #Assume plain string
            return unicode(node).replace('\n','')

    def visit_document(self, node):
        """Render the children of the root node, one per line."""
        return '\n'.join(self.visit(tag) for tag in node.contents)

    def get_plain_text(self, node):
        """Gets all text, removing all tags and newlines.

        Recurses through ``contents`` directly; it does not dispatch
        through ``visit``.
        """
        if hasattr(node, 'contents'):
            t = ' '.join(self.get_plain_text(tag) for tag in node.contents)
        else:
            t = unicode(node)
        return t.replace('\n','')

    def visit_header(self, node):
        """Render a title followed by a line of its underline character."""
        s = ' '.join(self.visit(tag) for tag in node.contents)
        # The underline is sized before newlines are removed from the
        # title, so it is at least as long as the title.
        spacer = self.headers[node.name]*len(s)
        return s.replace( '\n', '') + '\n' + spacer

    def visit_pre(self, node):
        """Render a literal block.

        Drops the first 5 and last 6 characters of the serialized node,
        i.e. assumes it serializes as ``<pre>...</pre>`` with no attributes.
        """
        return '::\n\n '+unicode(node)[5:-6].replace('<br />','\n').replace('<br></br>','\n').replace('\n','\n ')

    def _visit_list(self, node, ordered):
        """Render the children of a list node, one per line.

        Tracks nesting depth and list kind for ``visit_li`` and restores
        both on exit, even if a child visitor raises, so that the items of
        an enclosing list keep their own bullet style.
        """
        enclosing_is_ol = self._inside_ol
        self._nested_list += 1
        self._inside_ol = ordered
        try:
            return '\n'.join(self.visit(tag) for tag in node.contents)
        finally:
            self._nested_list -= 1
            self._inside_ol = enclosing_is_ol

    def visit_ul(self, node):
        """Render a bulleted list."""
        return self._visit_list(node, False)

    def visit_ol(self, node):
        """Render an auto-numbered list."""
        return self._visit_list(node, True)

    def visit_li(self, node):
        """Render one list item: indentation by nesting depth, then
        ``#. `` inside an ordered list or ``- `` otherwise."""
        return (' '*self._nested_list
                + ('#. ' if self._inside_ol else '- ')
                +' '.join(self.visit(tag) for tag in node.contents))

    def visit_display(self, node):
        """Render display latex as a MATH directive.

        Drops the first 9 and last 10 characters of the serialized node,
        i.e. assumes it serializes as ``<display>...</display>``.
        """
        return ('\n.. MATH::\n\n ' +
                unicode(node)[9:-10].replace('<br></br>','\n').replace('\n','\n ') +
                '\n\n')

    def visit_img(self, node):
        """Render a centered image directive pointing into ``images_dir``;
        spaces in the ``src`` attribute become underscores."""
        return '.. image:: ' + os.path.join(self.images_dir, node['src'].replace(' ','_')) + '\n :align: center\n'

    def visit_table(self,node):
        """Render a table as an rst simple table.

        Rows are taken from ``tr`` children of the table itself or of its
        ``thead``/``tbody``.  Short rows are padded with empty cells, each
        column is as wide as its longest cell, and a line of dashes is
        written after the rows of a ``thead``.  A table without any cell
        renders as the empty string.
        """
        # ``None`` entries mark where a separator line goes.
        rows = []
        for elt in node.contents:
            if not hasattr(elt,'name'):
                continue
            if elt.name in ('thead', 'tbody'):
                rows.extend(self.prepare_tr(row)
                            for row in elt
                            if hasattr(row,'name') and
                               row.name=='tr')
                if elt.name == 'thead':
                    rows.append(None)
            elif elt.name == 'tr':
                rows.append(self.prepare_tr(elt))

        data_rows = [row for row in rows if row is not None]
        # max() below would raise ValueError on an empty sequence.
        if not data_rows:
            return ''
        ncols = max(len(row) for row in data_rows)
        if not ncols:
            return ''
        for row in data_rows:
            if len(row) < ncols:
                row.extend( ['']*(ncols - len(row)))
        # At least one character per column so that the border lines
        # still mark a column whose cells are all empty.
        cols_sizes = [max(1, max(len(td) for td in tds_in_col))
                      for tds_in_col in zip(*data_rows)]
        border = ' '.join('='*c for c in cols_sizes)
        separator = ' '.join('-'*c for c in cols_sizes)

        result = [border]
        for row in rows:
            if row is None:
                result.append(separator)
            else:
                result.append(' '.join(td.ljust(l)
                                       for l,td in zip(cols_sizes,row)))
        result.append(border)
        return '\n'.join(result)

    def prepare_tr(self, node):
        """Return the rendered cells of a table row as a list of strings.

        Children equal to a bare newline are skipped.  Cell visitors append
        a newline to their text (see ``visit_inline_no_tag``); newlines are
        replaced by spaces and the cell is stripped so that every cell fits
        on the single line of its row.
        """
        return [self.visit(tag).replace('\n', ' ').strip()
                for tag in node.contents if tag!='\n']

    def visit_br(self, node):
        """A line break is a newline."""
        return '\n'

    def visit_strong(self, node):
        """Render bold text; content that already contains inline code is
        returned without the ``**`` markers."""
        if node.contents:
            content = ' '.join(self.visit(tag) for tag in node.contents).strip()
            if '``' in content or self._inside_code_tag:
                return content
            else:
                return '**' + content + '**'
        else:
            return ''

    def visit_em(self,node):
        """Render emphasized text."""
        if node.contents:
            return '*' + ' '.join(self.visit(tag) for tag in node.contents).strip() + '*'
        else:
            return ''

    def visit_code(self, node):
        """Render inline code from the plain text of the node."""
        if node.contents:
            # NOTE(review): get_plain_text does not go through visit(), so
            # no other visitor runs while this flag is set.
            self._inside_code_tag = True
            content = self.get_plain_text(node).strip()
            self._inside_code_tag = False
            return '``' + content + '``'
        else:
            return ''

    def visit_inline_no_tag(self, node):
        """Render the children on one line, stripped, followed by a newline."""
        return (' '.join(self.visit(tag)
                         for tag in node.contents)).strip() + '\n'

    def visit_block_no_tag(self, node):
        """Render the children one per line."""
        return '\n'.join(self.visit(tag) for tag in node.contents)

    def visit_a(self, node):
        """Render a hyperlink from the node's children and its ``href``."""
        return ('`' + ' '.join(self.visit(tag) for tag in node.contents) +
                ' <' + node['href'] + '>`_'
                )
| 314 | |
def html2rst(text, images_dir):
    r"""Converts html, typically generated by tinyMCE, into rst
    compatible with Sage documentation.

    The main job is done by BeautifulSoup, which is much more
    robust than conventional parsers like HTMLParser, but also
    several details specific of this context are taken into
    account, so this code differs from generic approaches like
    those found on the web.

    INPUT:

    - ``text`` -- string -- a chunk of HTML text

    - ``images_dir`` -- string -- folder where images are stored

    OUTPUT:

    - string -- rst text

    EXAMPLES::

        sage: from sagenb.misc.comments2rst import html2rst
        sage: html2rst('<p>Some text with <em>math</em>: $e^{\pi i}=-1$</p>', '')
        u'Some text with *math* : :math:`e^{\\pi i}=-1`\n'
        sage: html2rst('<p>Text with <em>incorrect</p> nesting</em>.', '')
        u'Text with *incorrect*\n\n nesting\n.'
        sage: html2rst('<pre>Preformatted: \n a+2\n</pre><p> Not preformatted: \n a+2\n</p>', '')
        u'::\n\n Preformatted: \n a+2\n \nNot preformatted: a\\+2\n'
        sage: html2rst('áñ ñá','')
        u'\xe1\xf1 \xf1\xe1'
        sage: html2rst('<p>some text</p><p>$$</p><p>3.183098861 \cdot 10^{-1}</p><p>$$</p>','')
        u'some text\n\n.. MATH::\n\n 3.183098861 \\cdot 10^{-1}\n'
    """
    # The docstring is raw so that the backslashes in the examples reach
    # the doctest runner untouched.

    #replace $$some display latex$$ with
    #<display>some display latex</display>
    text = preprocess_display_latex(text)

    #eliminate nasty &nbsp; entities: done on the raw text, before the
    #parser converts entities into characters
    text = text.replace('&nbsp;',' ')

    #ICantBelieveItsBeautifulSoup is better than BeautifulSoup
    #for html that wasn't generated by humans (like tinyMCE)
    soup = ICantBelieveItsBeautifulSoup(text,
               convertEntities=ICantBelieveItsBeautifulSoup.HTML_ENTITIES)

    #remove all comments
    for comment in soup.findAll(text=lambda node: isinstance(node, Comment)):
        comment.extract()

    # Both helpers modify the soup in place.
    replace_courier(soup)
    replace_latex(soup)
    converter = Soup2Rst(images_dir)
    return converter.visit(soup)