| 1 | """ |
|---|
| 2 | Processes SAGE documentation into notebook worksheet format with |
|---|
| 3 | evaluatable examples. |
|---|
| 4 | |
|---|
| 5 | This takes in any HTML document, i.e. sage documentation, and returns it in |
|---|
| 6 | the editable format (like the notebook edit window). It also returns a |
|---|
| 7 | string representing the css link for the document. |
|---|
| 8 | The SGML parser is set up to return only the body of the html documentation |
|---|
| 9 | page and to re-format sage examples and type-setting. |
|---|
| 10 | |
|---|
| 11 | Note: |
|---|
| 12 | This extension of sgmllib.SGMLParser was partly inspired by Mark Pilgrim's 'Dive Into Python' examples. |
|---|
| 13 | |
|---|
| 14 | Author: |
|---|
| 15 | -- Dorian Raymer (2006) |
|---|
| 16 | |
|---|
| 17 | |
|---|
| 18 | """ |
|---|
| 19 | |
|---|
| 20 | from sgmllib import SGMLParser |
|---|
| 21 | from urllib import splittag |
|---|
| 22 | from htmlentitydefs import entitydefs |
|---|
| 23 | |
|---|
| 24 | class DocHTMLProcessor(SGMLParser): |
|---|
| 25 | |
|---|
def reset(self):
    """Create (or re-create) every piece of parser state.

    SGMLParser.__init__ calls reset(), so all attributes the handlers
    rely on are initialised here instead of in a constructor.
    """
    # What the parser keeps: fragments of the piece currently being
    # collected, the finished worksheet text, and the stylesheet href.
    self.temp_pieces = []
    self.all_pieces = ''
    self.css_href = None

    # Flags.  Nothing is kept until the <body> tag has been seen.
    self.bodyQ = False
    self.in_verbatim_div = False
    self.in_math_span = False
    self.in_mathdisplay = False

    # Counters: <pre> cells started inside verbatim divs, and pieces
    # handed off to the output in total.
    self.cellcount = 0
    self.allcount = 0

    SGMLParser.reset(self)
|---|
| 47 | |
|---|
def process_doc_html(self, doc_path, full_path, doc_in):
    """Convert one HTML documentation page into worksheet text.

    This is the only method that needs to be called externally.  The
    other methods are SGMLParser callbacks that run while self.feed()
    parses the page, so they are never called explicitly here.

    doc_path  -- stored on the instance; start_img prefixes it to the
                 src attribute of images.
    full_path -- stored on the instance; rewrite_href and start_img use
                 it when rewriting links and image sources.
    doc_in    -- the HTML page, as a string, handed to self.feed().

    Returns the tuple (worksheet_text, css_href); css_href is None when
    no stylesheet <link> was seen.
    """
    import re

    self.doc_path = doc_path
    self.full_path = full_path
    self.feed(doc_in)   # SGMLParser call
    self.close()        # SGMLParser call
    self.hand_off_temp_pieces('to_doc_pieces')
    # Drop the closing </body></html> copied into the output by the
    # generic end-tag handler.  The tags are matched explicitly rather
    # than slicing off a fixed 16 characters, so a page that ends with
    # different whitespace (or without these tags) is not truncated.
    self.all_pieces = re.sub(r'</body>\s*(?:</html>\s*)?\Z', '',
                             self.all_pieces)
    return self.all_pieces, self.css_href
|---|
| 62 | |
|---|
| 63 | |
|---|
def hand_off_temp_pieces(self, piece_type):
    """Move the fragments collected so far into the finished output.

    Content is split into two kinds of pieces: documentation text and
    sage examples.  With piece_type 'to_doc_pieces' the joined fragments
    are appended to self.all_pieces as they are; with any other value
    they are first converted by process_cell_input_output.  Leading
    whitespace is stripped either way, the fragment list is emptied and
    self.allcount is incremented.
    """
    text = "".join(self.temp_pieces).lstrip()
    if piece_type != 'to_doc_pieces':
        text = self.process_cell_input_output(text)
    self.all_pieces += text
    self.temp_pieces = []
    self.allcount += 1
|---|
| 81 | |
|---|
def process_cell_input_output(self, cell_piece):
    """Format the text of one class='verbatim' div.

    All verbatim divs contain code examples.  Text that does not begin
    with an input prompt is treated as a usage model (for instance a
    block starting with INPUT:) and is returned wrapped in
    <div class="verbatim"><pre>...</pre></div>, with a space inserted
    after every '{' and '}'.

    Text that begins with a prompt is rewritten in the Notebook edit
    format: each run of input lines followed by its output becomes

        {{{
        input lines
        ///
        output lines
        }}}

    Lines starting with '...' continue the current input.  Recognised
    prompts are 'sage:' and the Python prompt, both as the entity text
    '&gt;&gt;&gt;' (handle_entityref re-emits entities verbatim, so
    this 12-character form is what arrives here) and as a literal '>>>'.
    """
    prompts = ('sage:', '&gt;&gt;&gt;', '>>>')

    def strip_prompt(line):
        # Return the command following a prompt, or None if the line
        # does not start with one.
        for prompt in prompts:
            if line.startswith(prompt):
                return line[len(prompt):].lstrip()
        return None

    if strip_prompt(cell_piece) is None:
        # Usage model: keep it as pre-formatted text.
        # NOTE(review): the brace spacing presumably stops literal
        # braces from being read as cell delimiters -- confirm.
        piece = '<div class="verbatim"><pre>' + cell_piece
        piece = piece.replace('{', '{ ').replace('}', '} ')
        return piece + '</pre></div>'

    # Group and format inputs and outputs.
    chunks = ['{{{\n']
    in_output = False
    for line in cell_piece.split('\n'):
        line = line.lstrip()
        command = strip_prompt(line)
        if command is not None:
            if in_output:
                # A prompt after output starts a new cell.
                chunks.append('}}}\n{{{\n')
                in_output = False
            chunks.append(command + '\n')
        elif line[:3] == '...':
            chunks.append(line[3:] + '\n')
        else:
            if not in_output:
                # First output line of this cell: write the separator.
                chunks.append('///\n')
                in_output = True
            chunks.append(line + '\n')
    chunks.append('}}}\n')
    return "".join(chunks)
|---|
| 131 | |
|---|
| 132 | |
|---|
| 133 | |
|---|
def rewrite_href(self, href_value):
    """Rewrite a relative link so that it goes through the doc browser.

    A relative href 'dir/file.html#tag' becomes
    '/doc_browser?<full_path>dir/?file.html#tag', where <full_path> is
    self.full_path.  The '#tag' part is only re-attached when it is not
    empty.

    An href that starts with a URL scheme (http:, mailto:, ...) is
    returned unchanged, since prefixing it with the doc browser path
    would break the link.
    """
    import re

    if re.match(r'[A-Za-z][A-Za-z0-9+.-]*:', href_value):
        return href_value

    # Split off the fragment after the last '#'.
    base, hash_mark, fragment = href_value.rpartition('#')
    if not hash_mark:
        base, fragment = href_value, None

    # Everything up to and including the last '/' extends the path;
    # the remainder is the file name.
    directory, slash, file_name = base.rpartition('/')
    url_path = '/doc_browser?' + self.full_path + directory + slash + '?'

    href_new = url_path + file_name
    if fragment:
        href_new += '#' + fragment
    return href_new
|---|
| 159 | |
|---|
def rewrite_src(self, src_value):
    """Return src_value without a leading '..'.

    Only the literal two-character prefix '..' is removed.  (lstrip('..')
    treats its argument as a set of characters and would strip every
    leading dot, e.g. turning '.hidden/a.png' into 'hidden/a.png'.)
    """
    if src_value.startswith('..'):
        return src_value[2:]
    return src_value
|---|
| 165 | |
|---|
| 166 | |
|---|
| 167 | ############################################## |
|---|
| 168 | ## General tag handlers |
|---|
| 169 | ## |
|---|
| 170 | |
|---|
def unknown_starttag(self, tag, attrs):
    """Re-emit an opening tag with its attributes, once inside <body>.

    attrs is a list of (name, value) pairs; each is written back as
    name="value" in the order given.
    """
    if not self.bodyQ:
        return
    attr_text = "".join([' %s="%s"' % (key, value) for key, value in attrs])
    self.temp_pieces.append("<%s%s>" % (tag, attr_text))
|---|
| 175 | |
|---|
def unknown_endtag(self, tag):
    """Re-emit a closing tag, once inside <body>."""
    if not self.bodyQ:
        return
    self.temp_pieces.append("</%s>" % (tag,))
|---|
| 179 | |
|---|
def handle_data(self, data):
    """Keep plain text unchanged, once inside <body>."""
    if not self.bodyQ:
        return
    self.temp_pieces.append(data)
|---|
| 183 | |
|---|
def handle_charref(self, ref):
    """Re-emit a numeric character reference as '&#ref;', once inside <body>."""
    if not self.bodyQ:
        return
    self.temp_pieces.append("&#%s;" % (ref,))
|---|
| 187 | |
|---|
def handle_entityref(self, ref):
    """Re-emit an entity reference, once inside <body>.

    The reference is written back as '&ref'; the closing ';' is added
    only when ref is a name listed in entitydefs.
    """
    if not self.bodyQ:
        return
    text = '&' + ref
    # The `in` operator replaces dict.has_key(), which was removed in
    # Python 3; the lookup itself is unchanged.
    if ref in entitydefs:
        text += ';'
    self.temp_pieces.append(text)
|---|
| 193 | |
|---|
def handle_comment(self, data):
    """Re-emit an HTML comment as '<!--data-->', once inside <body>."""
    if not self.bodyQ:
        return
    self.temp_pieces.append("<!--%s-->" % (data,))
|---|
| 197 | |
|---|
def handle_pi(self, text):
    """Re-emit a processing instruction as '<?text>', once inside <body>."""
    if not self.bodyQ:
        return
    self.temp_pieces.append("<?%s>" % (text,))
|---|
| 201 | |
|---|
def handle_decl(self, text):
    """Re-emit a declaration as '<!text>', once inside <body>."""
    if not self.bodyQ:
        return
    self.temp_pieces.append("<!%s>" % (text,))
|---|
| 205 | |
|---|
| 206 | |
|---|
| 207 | ############################################# |
|---|
| 208 | ## Specific tag handlers |
|---|
| 209 | ## |
|---|
| 210 | |
|---|
def start_link(self, attrs):
    """Remember the href of a stylesheet <link>.

    When one of the rel attributes equals 'stylesheet' (compared case
    insensitively), the first href attribute is stored in self.css_href.
    A stylesheet link without any href leaves self.css_href untouched
    instead of raising IndexError.
    """
    rel_values = [value.lower() for key, value in attrs if key == 'rel']
    if 'stylesheet' not in rel_values:
        return
    for key, value in attrs:
        if key == 'href':
            self.css_href = value
            break
|---|
| 216 | |
|---|
| 217 | |
|---|
| 218 | |
|---|
def start_body(self, attrs):
    """Note that <body> has been reached; from here on content is kept.

    The <body> tag itself is not copied to the output and attrs is
    ignored.
    """
    self.bodyQ = True
|---|
| 221 | |
|---|
def start_a(self, attrs):
    """Emit an <a> tag with its href routed through rewrite_href.

    Every href entry of attrs is replaced in place by the rewritten
    value, then the tag is written out like any other opening tag.
    Nothing happens before <body> has been seen.
    """
    if not self.bodyQ:
        return
    for index, (name, value) in enumerate(attrs):
        if name.lower() == 'href':
            attrs[index] = ('href', self.rewrite_href(value))
    self.unknown_starttag('a', attrs)
|---|
| 231 | |
|---|
| 232 | |
|---|
def start_div(self, attrs):
    """Handle an opening <div>, treating two classes specially.

    class="verbatim" marks the start of a code example: the flag
    in_verbatim_div is set and the tag itself is not emitted.
    class="mathdisplay" sets in_mathdisplay; that div, like any other,
    is then written out as an ordinary opening tag.
    """
    for name, value in attrs:
        if name.lower() != 'class':
            continue
        css_class = value.lower()
        if css_class == 'verbatim':
            self.in_verbatim_div = True
            return
        if css_class == 'mathdisplay':
            self.in_mathdisplay = True
    self.unknown_starttag('div', attrs)
|---|
| 242 | |
|---|
| 243 | |
|---|
def end_div(self):
    """Handle a closing </div>.

    When a verbatim div is open, its collected text is handed off as a
    cell piece and no closing tag is emitted (the opening tag was not
    emitted either).  Otherwise '</div>' is written out, but only once
    <body> has been seen, matching the guard applied to opening tags so
    that no unbalanced closing tag reaches the output.
    """
    if self.in_verbatim_div:
        self.in_verbatim_div = False
        self.hand_off_temp_pieces('to_cell_pieces')
        return
    if self.bodyQ:
        self.temp_pieces.append("</div>")
|---|
| 251 | |
|---|
def start_pre(self, attrs):
    """Handle an opening <pre>.

    Outside a verbatim div the tag is written out unchanged.  Inside
    one it marks the start of a cell: the documentation text collected
    so far is handed off, the tag is dropped and cellcount is
    incremented.
    """
    if not self.in_verbatim_div:
        self.unknown_starttag('pre', attrs)
        return
    self.hand_off_temp_pieces('to_doc_pieces')
    self.cellcount += 1
|---|
| 258 | |
|---|
def end_pre(self):
    """Emit </pre> unless it closes the <pre> of a verbatim div."""
    if self.in_verbatim_div:
        return
    self.unknown_endtag('pre')
|---|
| 262 | |
|---|
def start_span(self, attrs):
    """Emit an opening <span>, noting when it is a math span.

    A class attribute equal to 'math' (in any letter case) sets
    in_math_span and is replaced in place by ('class', 'math').
    """
    for index, (name, value) in enumerate(attrs):
        if name.lower() == 'class' and value.lower() == 'math':
            self.in_math_span = True
            attrs[index] = ('class', 'math')
    self.unknown_starttag('span', attrs)
|---|
| 271 | |
|---|
def end_span(self):
    """Emit </span>, clearing the math-span flag if it was set."""
    was_math = self.in_math_span
    if was_math:
        self.in_math_span = False
    self.unknown_endtag('span')
|---|
| 276 | |
|---|
def start_img(self, attrs):
    """Handle an <img> tag, once inside <body>.

    Inside a span with class=math the image is replaced by the text of
    its alt attribute.  Any other image is written out with every src
    attribute prefixed by self.doc_path + self.full_path.
    """
    if not self.bodyQ:
        return

    if self.in_math_span:
        for name, value in attrs:
            if name.lower() == 'alt':
                # Print the alt text in place of the image.
                self.temp_pieces.append(value)
                return

    for index, (name, value) in enumerate(attrs):
        if name.lower() == 'src':
            attrs[index] = ('src', self.doc_path + self.full_path + value)
    attr_text = "".join([' %s="%s"' % (key, value) for key, value in attrs])
    self.temp_pieces.append("<img %s>" % (attr_text,))
|---|
| 297 | |
|---|
| 298 | |
|---|
| 299 | |
|---|
def end_img(self):
    """Emit </img>, except while inside a math span."""
    if not self.in_math_span:
        self.unknown_endtag('img')
|---|
| 304 | |
|---|
| 305 | |
|---|