#!/usr/bin/env python
# -*- coding: utf-8 -*-
3 | # Copyright (c) 2012 Trent Mick.␊ |
4 | # Copyright (c) 2007-2008 ActiveState Corp.␊ |
5 | # License: MIT (http://www.opensource.org/licenses/mit-license.php)␊ |
6 | ␊ |
7 | from __future__ import generators␊ |
8 | ␊ |
9 | r"""A fast and complete Python implementation of Markdown.␊ |
10 | ␊ |
11 | [from http://daringfireball.net/projects/markdown/]␊ |
12 | > Markdown is a text-to-HTML filter; it translates an easy-to-read /␊ |
13 | > easy-to-write structured text format into HTML. Markdown's text␊ |
14 | > format is most similar to that of plain text email, and supports␊ |
15 | > features such as headers, *emphasis*, code blocks, blockquotes, and␊ |
16 | > links.␊ |
17 | >␊ |
18 | > Markdown's syntax is designed not as a generic markup language, but␊ |
19 | > specifically to serve as a front-end to (X)HTML. You can use span-level␊ |
20 | > HTML tags anywhere in a Markdown document, and you can use block level␊ |
> HTML tags (like <div> and <table>) as well.
22 | ␊ |
23 | Module usage:␊ |
24 | ␊ |
25 | >>> import markdown2␊ |
26 | >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`␊ |
27 | u'<p><em>boo!</em></p>\n'␊ |
28 | ␊ |
29 | >>> markdowner = Markdown()␊ |
30 | >>> markdowner.convert("*boo!*")␊ |
31 | u'<p><em>boo!</em></p>\n'␊ |
32 | >>> markdowner.convert("**boom!**")␊ |
33 | u'<p><strong>boom!</strong></p>\n'␊ |
34 | ␊ |
35 | This implementation of Markdown implements the full "core" syntax plus a␊ |
36 | number of extras (e.g., code syntax coloring, footnotes) as described on␊ |
37 | <https://github.com/trentm/python-markdown2/wiki/Extras>.␊ |
38 | """␊ |
39 | ␊ |
40 | cmdln_desc = """A fast and complete Python implementation of Markdown, a␊ |
41 | text-to-HTML conversion tool for web writers.␊ |
42 | ␊ |
43 | Supported extra syntax options (see -x|--extras option below and␊ |
44 | see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):␊ |
45 | ␊ |
46 | * code-friendly: Disable _ and __ for em and strong.␊ |
47 | * cuddled-lists: Allow lists to be cuddled to the preceding paragraph.␊ |
* fenced-code-blocks: Allows a code block to be fenced with '```' lines
  before and after, instead of being indented. Based on
  <http://github.github.com/github-flavored-markdown/> with support for
  syntax highlighting.
* footnotes: Support footnotes as in use on daringfireball.net and
  implemented in other Markdown processors (though not in Markdown.pl v1.0.1).
54 | * header-ids: Adds "id" attributes to headers. The id value is a slug of␊ |
55 | the header text.␊ |
* html-classes: Takes a dict mapping html tag names (lowercase) to a
  string to use for a "class" attribute value. Currently only supports
  "pre" and "code" tags. Add an issue if you require this for other tags.
59 | * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to␊ |
60 | have markdown processing be done on its contents. Similar to␊ |
61 | <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with␊ |
62 | some limitations.␊ |
63 | * metadata: Extract metadata from a leading '---'-fenced block.␊ |
64 | See <https://github.com/trentm/python-markdown2/issues/77> for details.␊ |
* nofollow: Add `rel="nofollow"` to all `<a>` tags with an href. See
  <http://en.wikipedia.org/wiki/Nofollow>.
67 | * pyshell: Treats unindented Python interactive shell sessions as <code>␊ |
68 | blocks.␊ |
69 | * link-patterns: Auto-link given regex patterns in text (e.g. bug number␊ |
70 | references, revision number references).␊ |
71 | * smarty-pants: Replaces ' and " with curly quotation marks or curly␊ |
72 | apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,␊ |
73 | and ellipses.␊ |
74 | * toc: The returned HTML string gets a new "toc_html" attribute which is␊ |
75 | a Table of Contents for the document. (experimental)␊ |
76 | * xml: Passes one-liner processing instructions and namespaced XML tags.␊ |
77 | * wiki-tables: Google Code Wiki-style tables. See␊ |
78 | <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.␊ |
79 | """␊ |
80 | ␊ |
81 | # Dev Notes:␊ |
# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
#   not yet sure if there are implications with this. Compare 'pydoc sre'
#   and 'perldoc perlre'.
85 | ␊ |
86 | __version_info__ = (2, 1, 1)␊ |
87 | __version__ = '.'.join(map(str, __version_info__))␊ |
88 | __author__ = "Trent Mick"␊ |
89 | ␊ |
90 | import os␊ |
91 | import sys␊ |
92 | from pprint import pprint␊ |
93 | import re␊ |
94 | import logging␊ |
95 | try:␊ |
96 | from hashlib import md5␊ |
97 | except ImportError:␊ |
98 | from md5 import md5␊ |
99 | import optparse␊ |
100 | from random import random, randint␊ |
101 | import codecs␊ |
102 | ␊ |
103 | ␊ |
104 | #---- Python version compat␊ |
105 | ␊ |
106 | try:␊ |
107 | from urllib.parse import quote # python3␊ |
108 | except ImportError:␊ |
109 | from urllib import quote # python2␊ |
110 | ␊ |
111 | if sys.version_info[:2] < (2,4):␊ |
112 | from sets import Set as set␊ |
113 | def reversed(sequence):␊ |
114 | for i in sequence[::-1]:␊ |
115 | yield i␊ |
116 | ␊ |
117 | # Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).␊ |
118 | if sys.version_info[0] <= 2:␊ |
119 | py3 = False␊ |
120 | try:␊ |
121 | bytes␊ |
122 | except NameError:␊ |
123 | bytes = str␊ |
124 | base_string_type = basestring␊ |
125 | elif sys.version_info[0] >= 3:␊ |
126 | py3 = True␊ |
127 | unicode = str␊ |
128 | base_string_type = str␊ |
129 | ␊ |
130 | ␊ |
131 | ␊ |
132 | #---- globals␊ |
133 | ␊ |
134 | DEBUG = False␊ |
135 | log = logging.getLogger("markdown")␊ |
136 | ␊ |
137 | DEFAULT_TAB_WIDTH = 4␊ |
138 | ␊ |
139 | ␊ |
# Note: on Python 3, `bytes(randint(...))` would create a zero-filled
# buffer of that length, so build the salt from the decimal string instead.
SECRET_SALT = str(randint(0, 1000000)).encode("utf-8")
141 | def _hash_text(s):␊ |
142 | return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()␊ |
143 | ␊ |
144 | # Table of hash values for escaped characters:␊ |
145 | g_escape_table = dict([(ch, _hash_text(ch))␊ |
146 | for ch in '\\`*_{}[]()>#+-.!'])␊ |
147 | ␊ |
148 | ␊ |
149 | ␊ |
150 | #---- exceptions␊ |
151 | ␊ |
152 | class MarkdownError(Exception):␊ |
153 | pass␊ |
154 | ␊ |
155 | ␊ |
156 | ␊ |
157 | #---- public api␊ |
158 | ␊ |
159 | def markdown_path(path, encoding="utf-8",␊ |
160 | html4tags=False, tab_width=DEFAULT_TAB_WIDTH,␊ |
161 | safe_mode=None, extras=None, link_patterns=None,␊ |
162 | use_file_vars=False):␊ |
163 | fp = codecs.open(path, 'r', encoding)␊ |
164 | text = fp.read()␊ |
165 | fp.close()␊ |
166 | return Markdown(html4tags=html4tags, tab_width=tab_width,␊ |
167 | safe_mode=safe_mode, extras=extras,␊ |
168 | link_patterns=link_patterns,␊ |
169 | use_file_vars=use_file_vars).convert(text)␊ |
170 | ␊ |
171 | def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,␊ |
172 | safe_mode=None, extras=None, link_patterns=None,␊ |
173 | use_file_vars=False):␊ |
174 | return Markdown(html4tags=html4tags, tab_width=tab_width,␊ |
175 | safe_mode=safe_mode, extras=extras,␊ |
176 | link_patterns=link_patterns,␊ |
177 | use_file_vars=use_file_vars).convert(text)␊ |
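
# Note: both helpers above are one-shot wrappers around the `Markdown`
# class below, e.g. `markdown(text, extras=["toc"])` is equivalent to
# `Markdown(extras=["toc"]).convert(text)`; reuse one instance when
# converting many documents.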
178 | ␊ |
179 | class Markdown(object):␊ |
180 | # The dict of "extras" to enable in processing -- a mapping of␊ |
181 | # extra name to argument for the extra. Most extras do not have an␊ |
182 | # argument, in which case the value is None.␊ |
183 | #␊ |
184 | # This can be set via (a) subclassing and (b) the constructor␊ |
185 | # "extras" argument.␊ |
186 | extras = None␊ |
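    # E.g. (a minimal sketch), these two are equivalent:
    #
    #   class FootnotedMarkdown(Markdown):  # hypothetical subclass name
    #       extras = ["footnotes"]
    #
    #   Markdown(extras=["footnotes"])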
187 | ␊ |
188 | urls = None␊ |
189 | titles = None␊ |
190 | html_blocks = None␊ |
191 | html_spans = None␊ |
192 | html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py␊ |
193 | ␊ |
194 | # Used to track when we're inside an ordered or unordered list␊ |
195 | # (see _ProcessListItems() for details):␊ |
196 | list_level = 0␊ |
197 | ␊ |
198 | _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)␊ |
199 | ␊ |
200 | def __init__(self, html4tags=False, tab_width=4, safe_mode=None,␊ |
201 | extras=None, link_patterns=None, use_file_vars=False):␊ |
202 | if html4tags:␊ |
203 | self.empty_element_suffix = ">"␊ |
204 | else:␊ |
205 | self.empty_element_suffix = " />"␊ |
206 | self.tab_width = tab_width␊ |
207 | ␊ |
208 | # For compatibility with earlier markdown2.py and with␊ |
209 | # markdown.py's safe_mode being a boolean,␊ |
210 | # safe_mode == True -> "replace"␊ |
211 | if safe_mode is True:␊ |
212 | self.safe_mode = "replace"␊ |
213 | else:␊ |
214 | self.safe_mode = safe_mode␊ |
215 | ␊ |
216 | # Massaging and building the "extras" info.␊ |
217 | if self.extras is None:␊ |
218 | self.extras = {}␊ |
219 | elif not isinstance(self.extras, dict):␊ |
220 | self.extras = dict([(e, None) for e in self.extras])␊ |
221 | if extras:␊ |
222 | if not isinstance(extras, dict):␊ |
223 | extras = dict([(e, None) for e in extras])␊ |
224 | self.extras.update(extras)␊ |
225 | assert isinstance(self.extras, dict)␊ |
226 | if "toc" in self.extras and not "header-ids" in self.extras:␊ |
227 | self.extras["header-ids"] = None # "toc" implies "header-ids"␊ |
228 | self._instance_extras = self.extras.copy()␊ |
229 | ␊ |
230 | self.link_patterns = link_patterns␊ |
231 | self.use_file_vars = use_file_vars␊ |
232 | self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)␊ |
233 | ␊ |
234 | self._escape_table = g_escape_table.copy()␊ |
235 | if "smarty-pants" in self.extras:␊ |
236 | self._escape_table['"'] = _hash_text('"')␊ |
237 | self._escape_table["'"] = _hash_text("'")␊ |
238 | ␊ |
239 | def reset(self):␊ |
240 | self.urls = {}␊ |
241 | self.titles = {}␊ |
242 | self.html_blocks = {}␊ |
243 | self.html_spans = {}␊ |
244 | self.list_level = 0␊ |
245 | self.extras = self._instance_extras.copy()␊ |
246 | if "footnotes" in self.extras:␊ |
247 | self.footnotes = {}␊ |
248 | self.footnote_ids = []␊ |
249 | if "header-ids" in self.extras:␊ |
250 | self._count_from_header_id = {} # no `defaultdict` in Python 2.4␊ |
251 | if "metadata" in self.extras:␊ |
252 | self.metadata = {}␊ |
253 | ␊ |
254 | # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"␊ |
255 | # should only be used in <a> tags with an "href" attribute.␊ |
256 | _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)␊ |
257 | ␊ |
258 | def convert(self, text):␊ |
259 | """Convert the given text."""␊ |
260 | # Main function. The order in which other subs are called here is␊ |
261 | # essential. Link and image substitutions need to happen before␊ |
262 | # _EscapeSpecialChars(), so that any *'s or _'s in the <a>␊ |
263 | # and <img> tags get encoded.␊ |
264 | ␊ |
265 | # Clear the global hashes. If we don't clear these, you get conflicts␊ |
266 | # from other articles when generating a page which contains more than␊ |
267 | # one article (e.g. an index page that shows the N most recent␊ |
268 | # articles):␊ |
269 | self.reset()␊ |
270 | ␊ |
271 | if not isinstance(text, unicode):␊ |
272 | #TODO: perhaps shouldn't presume UTF-8 for string input?␊ |
273 | text = unicode(text, 'utf-8')␊ |
274 | ␊ |
275 | if self.use_file_vars:␊ |
276 | # Look for emacs-style file variable hints.␊ |
277 | emacs_vars = self._get_emacs_vars(text)␊ |
278 | if "markdown-extras" in emacs_vars:␊ |
279 | splitter = re.compile("[ ,]+")␊ |
280 | for e in splitter.split(emacs_vars["markdown-extras"]):␊ |
281 | if '=' in e:␊ |
282 | ename, earg = e.split('=', 1)␊ |
283 | try:␊ |
284 | earg = int(earg)␊ |
285 | except ValueError:␊ |
286 | pass␊ |
287 | else:␊ |
288 | ename, earg = e, None␊ |
289 | self.extras[ename] = earg␊ |
290 | ␊ |
291 | # Standardize line endings:␊ |
292 | text = re.sub("\r\n|\r", "\n", text)␊ |
293 | ␊ |
294 | # Make sure $text ends with a couple of newlines:␊ |
295 | text += "\n\n"␊ |
296 | ␊ |
297 | # Convert all tabs to spaces.␊ |
298 | text = self._detab(text)␊ |
299 | ␊ |
300 | # Strip any lines consisting only of spaces and tabs.␊ |
301 | # This makes subsequent regexen easier to write, because we can␊ |
302 | # match consecutive blank lines with /\n+/ instead of something␊ |
303 | # contorted like /[ \t]*\n+/ .␊ |
304 | text = self._ws_only_line_re.sub("", text)␊ |
305 | ␊ |
306 | # strip metadata from head and extract␊ |
307 | if "metadata" in self.extras:␊ |
308 | text = self._extract_metadata(text)␊ |
309 | ␊ |
310 | text = self.preprocess(text)␊ |
311 | ␊ |
312 | if self.safe_mode:␊ |
313 | text = self._hash_html_spans(text)␊ |
314 | ␊ |
315 | # Turn block-level HTML blocks into hash entries␊ |
316 | text = self._hash_html_blocks(text, raw=True)␊ |
317 | ␊ |
318 | if "fenced-code-blocks" in self.extras:␊ |
319 | text = self._do_fenced_code_blocks(text)␊ |
320 | ␊ |
321 | # Strip link definitions, store in hashes.␊ |
322 | if "footnotes" in self.extras:␊ |
323 | # Must do footnotes first because an unlucky footnote defn␊ |
324 | # looks like a link defn:␊ |
325 | # [^4]: this "looks like a link defn"␊ |
326 | text = self._strip_footnote_definitions(text)␊ |
327 | text = self._strip_link_definitions(text)␊ |
328 | ␊ |
329 | text = self._run_block_gamut(text)␊ |
330 | ␊ |
331 | if "footnotes" in self.extras:␊ |
332 | text = self._add_footnotes(text)␊ |
333 | ␊ |
334 | text = self.postprocess(text)␊ |
335 | ␊ |
336 | text = self._unescape_special_chars(text)␊ |
337 | ␊ |
338 | if self.safe_mode:␊ |
339 | text = self._unhash_html_spans(text)␊ |
340 | ␊ |
341 | if "nofollow" in self.extras:␊ |
342 | text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)␊ |
343 | ␊ |
344 | text += "\n"␊ |
345 | ␊ |
346 | rv = UnicodeWithAttrs(text)␊ |
347 | if "toc" in self.extras:␊ |
348 | rv._toc = self._toc␊ |
349 | if "metadata" in self.extras:␊ |
350 | rv.metadata = self.metadata␊ |
351 | return rv␊ |
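
    # Sketch of consuming the return value (input made up; assumes the
    # "toc" and "metadata" extras are enabled):
    #
    #   md = Markdown(extras=["toc", "metadata"])
    #   html = md.convert("---\ntitle: x\n---\n\n# A\n\ntext\n")
    #   html.toc_html   # TOC HTML, per the "toc" extra
    #   html.metadata   # dict from the '---'-fenced block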
352 | ␊ |
353 | def postprocess(self, text):␊ |
354 | """A hook for subclasses to do some postprocessing of the html, if␊ |
355 | desired. This is called before unescaping of special chars and␊ |
356 | unhashing of raw HTML spans.␊ |
357 | """␊ |
358 | return text␊ |
359 | ␊ |
360 | def preprocess(self, text):␊ |
361 | """A hook for subclasses to do some preprocessing of the Markdown, if␊ |
362 | desired. This is called after basic formatting of the text, but prior␊ |
363 | to any extras, safe mode, etc. processing.␊ |
364 | """␊ |
365 | return text␊ |
366 | ␊ |
    # The content starts with metadata if it opens with a '---'-fenced
    # block of `key: value` pairs. E.g. (indented for presentation):
369 | # ---␊ |
370 | # foo: bar␊ |
371 | # another-var: blah blah␊ |
372 | # ---␊ |
373 | _metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")␊ |
374 | ␊ |
375 | def _extract_metadata(self, text):␊ |
376 | # fast test␊ |
377 | if not text.startswith("---"):␊ |
378 | return text␊ |
379 | match = self._metadata_pat.match(text)␊ |
380 | if not match:␊ |
381 | return text␊ |
382 | ␊ |
383 | tail = text[len(match.group(0)):]␊ |
384 | metadata_str = match.group(1).strip()␊ |
385 | for line in metadata_str.split('\n'):␊ |
386 | key, value = line.split(':', 1)␊ |
387 | self.metadata[key.strip()] = value.strip()␊ |
388 | ␊ |
389 | return tail␊ |
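
    # E.g. (values made up): given text starting with
    #
    #   ---
    #   title: My Doc
    #   ---
    #
    # this sets self.metadata["title"] = "My Doc" and returns the text
    # following the closing '---' line.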
390 | ␊ |
391 | ␊ |
392 | _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)␊ |
393 | # This regular expression is intended to match blocks like this:␊ |
394 | # PREFIX Local Variables: SUFFIX␊ |
395 | # PREFIX mode: Tcl SUFFIX␊ |
396 | # PREFIX End: SUFFIX␊ |
397 | # Some notes:␊ |
398 | # - "[ \t]" is used instead of "\s" to specifically exclude newlines␊ |
399 | # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does␊ |
400 | # not like anything other than Unix-style line terminators.␊ |
401 | _emacs_local_vars_pat = re.compile(r"""^␊ |
        (?P<prefix>[^\r\n]*?)
403 | [\ \t]*Local\ Variables:[\ \t]*␊ |
404 | (?P<suffix>.*?)(?:\r\n|\n|\r)␊ |
405 | (?P<content>.*?\1End:)␊ |
406 | """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)␊ |
407 | ␊ |
408 | def _get_emacs_vars(self, text):␊ |
409 | """Return a dictionary of emacs-style local variables.␊ |
410 | ␊ |
411 | Parsing is done loosely according to this spec (and according to␊ |
412 | some in-practice deviations from this):␊ |
413 | http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables␊ |
414 | """␊ |
415 | emacs_vars = {}␊ |
416 | SIZE = pow(2, 13) # 8kB␊ |
417 | ␊ |
418 | # Search near the start for a '-*-'-style one-liner of variables.␊ |
419 | head = text[:SIZE]␊ |
420 | if "-*-" in head:␊ |
421 | match = self._emacs_oneliner_vars_pat.search(head)␊ |
422 | if match:␊ |
423 | emacs_vars_str = match.group(1)␊ |
424 | assert '\n' not in emacs_vars_str␊ |
425 | emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')␊ |
426 | if s.strip()]␊ |
427 | if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:␊ |
428 | # While not in the spec, this form is allowed by emacs:␊ |
429 | # -*- Tcl -*-␊ |
430 | # where the implied "variable" is "mode". This form␊ |
431 | # is only allowed if there are no other variables.␊ |
432 | emacs_vars["mode"] = emacs_var_strs[0].strip()␊ |
433 | else:␊ |
434 | for emacs_var_str in emacs_var_strs:␊ |
435 | try:␊ |
436 | variable, value = emacs_var_str.strip().split(':', 1)␊ |
437 | except ValueError:␊ |
438 | log.debug("emacs variables error: malformed -*- "␊ |
439 | "line: %r", emacs_var_str)␊ |
440 | continue␊ |
441 | # Lowercase the variable name because Emacs allows "Mode"␊ |
442 | # or "mode" or "MoDe", etc.␊ |
443 | emacs_vars[variable.lower()] = value.strip()␊ |
444 | ␊ |
445 | tail = text[-SIZE:]␊ |
446 | if "Local Variables" in tail:␊ |
447 | match = self._emacs_local_vars_pat.search(tail)␊ |
448 | if match:␊ |
449 | prefix = match.group("prefix")␊ |
450 | suffix = match.group("suffix")␊ |
451 | lines = match.group("content").splitlines(0)␊ |
452 | #print "prefix=%r, suffix=%r, content=%r, lines: %s"\␊ |
453 | # % (prefix, suffix, match.group("content"), lines)␊ |
454 | ␊ |
455 | # Validate the Local Variables block: proper prefix and suffix␊ |
456 | # usage.␊ |
457 | for i, line in enumerate(lines):␊ |
458 | if not line.startswith(prefix):␊ |
459 | log.debug("emacs variables error: line '%s' "␊ |
460 | "does not use proper prefix '%s'"␊ |
461 | % (line, prefix))␊ |
462 | return {}␊ |
463 | # Don't validate suffix on last line. Emacs doesn't care,␊ |
464 | # neither should we.␊ |
465 | if i != len(lines)-1 and not line.endswith(suffix):␊ |
466 | log.debug("emacs variables error: line '%s' "␊ |
467 | "does not use proper suffix '%s'"␊ |
468 | % (line, suffix))␊ |
469 | return {}␊ |
470 | ␊ |
471 | # Parse out one emacs var per line.␊ |
472 | continued_for = None␊ |
473 | for line in lines[:-1]: # no var on the last line ("PREFIX End:")␊ |
474 | if prefix: line = line[len(prefix):] # strip prefix␊ |
475 | if suffix: line = line[:-len(suffix)] # strip suffix␊ |
476 | line = line.strip()␊ |
477 | if continued_for:␊ |
478 | variable = continued_for␊ |
479 | if line.endswith('\\'):␊ |
480 | line = line[:-1].rstrip()␊ |
481 | else:␊ |
482 | continued_for = None␊ |
483 | emacs_vars[variable] += ' ' + line␊ |
484 | else:␊ |
485 | try:␊ |
486 | variable, value = line.split(':', 1)␊ |
487 | except ValueError:␊ |
488 | log.debug("local variables error: missing colon "␊ |
489 | "in local variables entry: '%s'" % line)␊ |
490 | continue␊ |
491 | # Do NOT lowercase the variable name, because Emacs only␊ |
492 | # allows "mode" (and not "Mode", "MoDe", etc.) in this block.␊ |
493 | value = value.strip()␊ |
494 | if value.endswith('\\'):␊ |
495 | value = value[:-1].rstrip()␊ |
496 | continued_for = variable␊ |
497 | else:␊ |
498 | continued_for = None␊ |
499 | emacs_vars[variable] = value␊ |
500 | ␊ |
501 | # Unquote values.␊ |
502 | for var, val in list(emacs_vars.items()):␊ |
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
                    or val.startswith("'") and val.endswith("'")):
505 | emacs_vars[var] = val[1:-1]␊ |
506 | ␊ |
507 | return emacs_vars␊ |
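
    # E.g., a document ending with the following block (any prefix/suffix;
    # HTML comments shown here) yields {"markdown-extras": "footnotes"},
    # which convert() applies when use_file_vars is true:
    #
    #   <!-- Local Variables: -->
    #   <!-- markdown-extras: footnotes -->
    #   <!-- End: -->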
508 | ␊ |
509 | # Cribbed from a post by Bart Lateur:␊ |
510 | # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>␊ |
511 | _detab_re = re.compile(r'(.*?)\t', re.M)␊ |
512 | def _detab_sub(self, match):␊ |
513 | g1 = match.group(1)␊ |
514 | return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))␊ |
515 | def _detab(self, text):␊ |
516 | r"""Remove (leading?) tabs from a file.␊ |
517 | ␊ |
518 | >>> m = Markdown()␊ |
519 | >>> m._detab("\tfoo")␊ |
520 | ' foo'␊ |
521 | >>> m._detab(" \tfoo")␊ |
522 | ' foo'␊ |
523 | >>> m._detab("\t foo")␊ |
524 | ' foo'␊ |
525 | >>> m._detab(" foo")␊ |
526 | ' foo'␊ |
527 | >>> m._detab(" foo\n\tbar\tblam")␊ |
528 | ' foo\n bar blam'␊ |
529 | """␊ |
530 | if '\t' not in text:␊ |
531 | return text␊ |
532 | return self._detab_re.subn(self._detab_sub, text)[0]␊ |
533 | ␊ |
    # I broke out the html5 tags here and added them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
536 | _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'␊ |
537 | ␊ |
538 | _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'␊ |
539 | _block_tags_a += _html5tags␊ |
540 | ␊ |
541 | _strict_tag_block_re = re.compile(r"""␊ |
542 | ( # save in \1␊ |
543 | ^ # start of line (with re.M)␊ |
544 | <(%s) # start tag = \2␊ |
545 | \b # word break␊ |
546 | (.*\n)*? # any number of lines, minimally matching␊ |
547 | </\2> # the matching end tag␊ |
548 | [ \t]* # trailing spaces/tabs␊ |
549 | (?=\n+|\Z) # followed by a newline or end of document␊ |
550 | )␊ |
551 | """ % _block_tags_a,␊ |
552 | re.X | re.M)␊ |
553 | ␊ |
554 | _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'␊ |
555 | _block_tags_b += _html5tags␊ |
556 | ␊ |
557 | _liberal_tag_block_re = re.compile(r"""␊ |
558 | ( # save in \1␊ |
559 | ^ # start of line (with re.M)␊ |
560 | <(%s) # start tag = \2␊ |
561 | \b # word break␊ |
562 | (.*\n)*? # any number of lines, minimally matching␊ |
563 | .*</\2> # the matching end tag␊ |
564 | [ \t]* # trailing spaces/tabs␊ |
565 | (?=\n+|\Z) # followed by a newline or end of document␊ |
566 | )␊ |
567 | """ % _block_tags_b,␊ |
568 | re.X | re.M)␊ |
569 | ␊ |
570 | _html_markdown_attr_re = re.compile(␊ |
571 | r'''\s+markdown=("1"|'1')''')␊ |
572 | def _hash_html_block_sub(self, match, raw=False):␊ |
573 | html = match.group(1)␊ |
574 | if raw and self.safe_mode:␊ |
575 | html = self._sanitize_html(html)␊ |
576 | elif 'markdown-in-html' in self.extras and 'markdown=' in html:␊ |
577 | first_line = html.split('\n', 1)[0]␊ |
578 | m = self._html_markdown_attr_re.search(first_line)␊ |
579 | if m:␊ |
580 | lines = html.split('\n')␊ |
581 | middle = '\n'.join(lines[1:-1])␊ |
582 | last_line = lines[-1]␊ |
583 | first_line = first_line[:m.start()] + first_line[m.end():]␊ |
584 | f_key = _hash_text(first_line)␊ |
585 | self.html_blocks[f_key] = first_line␊ |
586 | l_key = _hash_text(last_line)␊ |
587 | self.html_blocks[l_key] = last_line␊ |
588 | return ''.join(["\n\n", f_key,␊ |
589 | "\n\n", middle, "\n\n",␊ |
590 | l_key, "\n\n"])␊ |
591 | key = _hash_text(html)␊ |
592 | self.html_blocks[key] = html␊ |
593 | return "\n\n" + key + "\n\n"␊ |
594 | ␊ |
595 | def _hash_html_blocks(self, text, raw=False):␊ |
596 | """Hashify HTML blocks␊ |
597 | ␊ |
598 | We only want to do this for block-level HTML tags, such as headers,␊ |
599 | lists, and tables. That's because we still want to wrap <p>s around␊ |
600 | "paragraphs" that are wrapped in non-block-level tags, such as anchors,␊ |
601 | phrase emphasis, and spans. The list of tags we're looking for is␊ |
602 | hard-coded.␊ |
603 | ␊ |
604 | @param raw {boolean} indicates if these are raw HTML blocks in␊ |
605 | the original source. It makes a difference in "safe" mode.␊ |
606 | """␊ |
607 | if '<' not in text:␊ |
608 | return text␊ |
609 | ␊ |
610 | # Pass `raw` value into our calls to self._hash_html_block_sub.␊ |
611 | hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)␊ |
612 | ␊ |
613 | # First, look for nested blocks, e.g.:␊ |
614 | # <div>␊ |
615 | # <div>␊ |
616 | # tags for inner block must be indented.␊ |
617 | # </div>␊ |
618 | # </div>␊ |
619 | #␊ |
620 | # The outermost tags must start at the left margin for this to match, and␊ |
621 | # the inner nested divs must be indented.␊ |
622 | # We need to do this before the next, more liberal match, because the next␊ |
623 | # match will start at the first `<div>` and stop at the first `</div>`.␊ |
624 | text = self._strict_tag_block_re.sub(hash_html_block_sub, text)␊ |
625 | ␊ |
626 | # Now match more liberally, simply from `\n<tag>` to `</tag>\n`␊ |
627 | text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)␊ |
628 | ␊ |
629 | # Special case just for <hr />. It was easier to make a special␊ |
630 | # case than to make the other regex more complicated.␊ |
631 | if "<hr" in text:␊ |
632 | _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)␊ |
633 | text = _hr_tag_re.sub(hash_html_block_sub, text)␊ |
634 | ␊ |
635 | # Special case for standalone HTML comments:␊ |
636 | if "<!--" in text:␊ |
637 | start = 0␊ |
638 | while True:␊ |
639 | # Delimiters for next comment block.␊ |
640 | try:␊ |
641 | start_idx = text.index("<!--", start)␊ |
642 | except ValueError:␊ |
643 | break␊ |
644 | try:␊ |
645 | end_idx = text.index("-->", start_idx) + 3␊ |
646 | except ValueError:␊ |
647 | break␊ |
648 | ␊ |
649 | # Start position for next comment block search.␊ |
650 | start = end_idx␊ |
651 | ␊ |
652 | # Validate whitespace before comment.␊ |
653 | if start_idx:␊ |
654 | # - Up to `tab_width - 1` spaces before start_idx.␊ |
655 | for i in range(self.tab_width - 1):␊ |
656 | if text[start_idx - 1] != ' ':␊ |
657 | break␊ |
658 | start_idx -= 1␊ |
659 | if start_idx == 0:␊ |
660 | break␊ |
661 | # - Must be preceded by 2 newlines or hit the start of␊ |
662 | # the document.␊ |
663 | if start_idx == 0:␊ |
664 | pass␊ |
665 | elif start_idx == 1 and text[0] == '\n':␊ |
666 | start_idx = 0 # to match minute detail of Markdown.pl regex␊ |
667 | elif text[start_idx-2:start_idx] == '\n\n':␊ |
668 | pass␊ |
669 | else:␊ |
670 | break␊ |
671 | ␊ |
672 | # Validate whitespace after comment.␊ |
673 | # - Any number of spaces and tabs.␊ |
674 | while end_idx < len(text):␊ |
675 | if text[end_idx] not in ' \t':␊ |
676 | break␊ |
677 | end_idx += 1␊ |
                # - Must be followed by 2 newlines or hit end of text.
679 | if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):␊ |
680 | continue␊ |
681 | ␊ |
682 | # Escape and hash (must match `_hash_html_block_sub`).␊ |
683 | html = text[start_idx:end_idx]␊ |
684 | if raw and self.safe_mode:␊ |
685 | html = self._sanitize_html(html)␊ |
686 | key = _hash_text(html)␊ |
687 | self.html_blocks[key] = html␊ |
688 | text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]␊ |
689 | ␊ |
690 | if "xml" in self.extras:␊ |
691 | # Treat XML processing instructions and namespaced one-liner␊ |
692 | # tags as if they were block HTML tags. E.g., if standalone␊ |
693 | # (i.e. are their own paragraph), the following do not get␊ |
694 | # wrapped in a <p> tag:␊ |
695 | # <?foo bar?>␊ |
696 | #␊ |
697 | # <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>␊ |
698 | _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)␊ |
699 | text = _xml_oneliner_re.sub(hash_html_block_sub, text)␊ |
700 | ␊ |
701 | return text␊ |
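
    # Sketch of the round trip (hash value made up): a block like
    # "<div>hi</div>" is replaced in the text with "\n\nmd5-...\n\n" and
    # stored in self.html_blocks under that key; the placeholder is swapped
    # back for the raw HTML once paragraph-wrapping is done.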
702 | ␊ |
703 | def _strip_link_definitions(self, text):␊ |
704 | # Strips link definitions from text, stores the URLs and titles in␊ |
705 | # hash references.␊ |
706 | less_than_tab = self.tab_width - 1␊ |
707 | ␊ |
708 | # Link defs are in the form:␊ |
709 | # [id]: url "optional title"␊ |
710 | _link_def_re = re.compile(r"""␊ |
711 | ^[ ]{0,%d}\[(.+)\]: # id = \1␊ |
712 | [ \t]*␊ |
713 | \n? # maybe *one* newline␊ |
714 | [ \t]*␊ |
715 | <?(.+?)>? # url = \2␊ |
716 | [ \t]*␊ |
717 | (?:␊ |
718 | \n? # maybe one newline␊ |
719 | [ \t]*␊ |
720 | (?<=\s) # lookbehind for whitespace␊ |
721 | ['"(]␊ |
722 | ([^\n]*) # title = \3␊ |
723 | ['")]␊ |
724 | [ \t]*␊ |
725 | )? # title is optional␊ |
726 | (?:\n+|\Z)␊ |
727 | """ % less_than_tab, re.X | re.M | re.U)␊ |
728 | return _link_def_re.sub(self._extract_link_def_sub, text)␊ |
729 | ␊ |
730 | def _extract_link_def_sub(self, match):␊ |
731 | id, url, title = match.groups()␊ |
732 | key = id.lower() # Link IDs are case-insensitive␊ |
733 | self.urls[key] = self._encode_amps_and_angles(url)␊ |
734 | if title:␊ |
735 | self.titles[key] = title␊ |
736 | return ""␊ |
737 | ␊ |
738 | def _extract_footnote_def_sub(self, match):␊ |
739 | id, text = match.groups()␊ |
740 | text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()␊ |
741 | normed_id = re.sub(r'\W', '-', id)␊ |
742 | # Ensure footnote text ends with a couple newlines (for some␊ |
743 | # block gamut matches).␊ |
744 | self.footnotes[normed_id] = text + "\n\n"␊ |
745 | return ""␊ |
746 | ␊ |
747 | def _strip_footnote_definitions(self, text):␊ |
748 | """A footnote definition looks like this:␊ |
749 | ␊ |
750 | [^note-id]: Text of the note.␊ |
751 | ␊ |
752 | May include one or more indented paragraphs.␊ |
753 | ␊ |
754 | Where,␊ |
755 | - The 'note-id' can be pretty much anything, though typically it␊ |
756 | is the number of the footnote.␊ |
757 | - The first paragraph may start on the next line, like so:␊ |
758 | ␊ |
759 | [^note-id]:␊ |
760 | Text of the note.␊ |
761 | """␊ |
762 | less_than_tab = self.tab_width - 1␊ |
763 | footnote_def_re = re.compile(r'''␊ |
764 | ^[ ]{0,%d}\[\^(.+)\]: # id = \1␊ |
765 | [ \t]*␊ |
766 | ( # footnote text = \2␊ |
767 | # First line need not start with the spaces.␊ |
768 | (?:\s*.*\n+)␊ |
769 | (?:␊ |
770 | (?:[ ]{%d} | \t) # Subsequent lines must be indented.␊ |
771 | .*\n+␊ |
772 | )*␊ |
773 | )␊ |
774 | # Lookahead for non-space at line-start, or end of doc.␊ |
775 | (?:(?=^[ ]{0,%d}\S)|\Z)␊ |
776 | ''' % (less_than_tab, self.tab_width, self.tab_width),␊ |
777 | re.X | re.M)␊ |
778 | return footnote_def_re.sub(self._extract_footnote_def_sub, text)␊ |
779 | ␊ |
780 | ␊ |
781 | _hr_data = [␊ |
782 | ('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),␊ |
783 | ('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),␊ |
784 | ('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),␊ |
785 | ]␊ |
786 | ␊ |
787 | def _run_block_gamut(self, text):␊ |
788 | # These are all the transformations that form block-level␊ |
789 | # tags like paragraphs, headers, and list items.␊ |
790 | ␊ |
791 | if "fenced-code-blocks" in self.extras:␊ |
792 | text = self._do_fenced_code_blocks(text)␊ |
793 | ␊ |
794 | text = self._do_headers(text)␊ |
795 | ␊ |
796 | # Do Horizontal Rules:␊ |
797 | # On the number of spaces in horizontal rules: The spec is fuzzy: "If␊ |
798 | # you wish, you may use spaces between the hyphens or asterisks."␊ |
799 | # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the␊ |
800 | # hr chars to one or two. We'll reproduce that limit here.␊ |
801 | hr = "\n<hr"+self.empty_element_suffix+"\n"␊ |
802 | for ch, regex in self._hr_data:␊ |
803 | if ch in text:␊ |
804 | for m in reversed(list(regex.finditer(text))):␊ |
805 | tail = m.group(1).rstrip()␊ |
                    if not tail.strip(ch + ' ') and tail.count("   ") == 0:
807 | start, end = m.span()␊ |
808 | text = text[:start] + hr + text[end:]␊ |
809 | ␊ |
810 | text = self._do_lists(text)␊ |
811 | ␊ |
812 | if "pyshell" in self.extras:␊ |
813 | text = self._prepare_pyshell_blocks(text)␊ |
814 | if "wiki-tables" in self.extras:␊ |
815 | text = self._do_wiki_tables(text)␊ |
816 | ␊ |
817 | text = self._do_code_blocks(text)␊ |
818 | ␊ |
819 | text = self._do_block_quotes(text)␊ |
820 | ␊ |
821 | # We already ran _HashHTMLBlocks() before, in Markdown(), but that␊ |
822 | # was to escape raw HTML in the original Markdown source. This time,␊ |
823 | # we're escaping the markup we've just created, so that we don't wrap␊ |
824 | # <p> tags around block-level tags.␊ |
825 | text = self._hash_html_blocks(text)␊ |
826 | ␊ |
827 | text = self._form_paragraphs(text)␊ |
828 | ␊ |
829 | return text␊ |
830 | ␊ |
831 | def _pyshell_block_sub(self, match):␊ |
832 | lines = match.group(0).splitlines(0)␊ |
833 | _dedentlines(lines)␊ |
834 | indent = ' ' * self.tab_width␊ |
835 | s = ('\n' # separate from possible cuddled paragraph␊ |
836 | + indent + ('\n'+indent).join(lines)␊ |
837 | + '\n\n')␊ |
838 | return s␊ |
839 | ␊ |
840 | def _prepare_pyshell_blocks(self, text):␊ |
841 | """Ensure that Python interactive shell sessions are put in␊ |
842 | code blocks -- even if not properly indented.␊ |
843 | """␊ |
844 | if ">>>" not in text:␊ |
845 | return text␊ |
846 | ␊ |
847 | less_than_tab = self.tab_width - 1␊ |
848 | _pyshell_block_re = re.compile(r"""␊ |
849 | ^([ ]{0,%d})>>>[ ].*\n # first line␊ |
850 | ^(\1.*\S+.*\n)* # any number of subsequent lines␊ |
851 | ^\n # ends with a blank line␊ |
852 | """ % less_than_tab, re.M | re.X)␊ |
853 | ␊ |
854 | return _pyshell_block_re.sub(self._pyshell_block_sub, text)␊ |
855 | ␊ |
856 | def _wiki_table_sub(self, match):␊ |
857 | ttext = match.group(0).strip()␊ |
858 | #print 'wiki table: %r' % match.group(0)␊ |
859 | rows = []␊ |
860 | for line in ttext.splitlines(0):␊ |
861 | line = line.strip()[2:-2].strip()␊ |
862 | row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]␊ |
863 | rows.append(row)␊ |
864 | #pprint(rows)␊ |
865 | hlines = ['<table>', '<tbody>']␊ |
866 | for row in rows:␊ |
867 | hrow = ['<tr>']␊ |
868 | for cell in row:␊ |
869 | hrow.append('<td>')␊ |
870 | hrow.append(self._run_span_gamut(cell))␊ |
871 | hrow.append('</td>')␊ |
872 | hrow.append('</tr>')␊ |
873 | hlines.append(''.join(hrow))␊ |
874 | hlines += ['</tbody>', '</table>']␊ |
875 | return '\n'.join(hlines) + '\n'␊ |
876 | ␊ |
877 | def _do_wiki_tables(self, text):␊ |
878 | # Optimization.␊ |
879 | if "||" not in text:␊ |
880 | return text␊ |
881 | ␊ |
882 | less_than_tab = self.tab_width - 1␊ |
883 | wiki_table_re = re.compile(r'''␊ |
884 | (?:(?<=\n\n)|\A\n?) # leading blank line␊ |
885 | ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line␊ |
886 | (^\1\|\|.+?\|\|\n)* # any number of subsequent lines␊ |
887 | ''' % less_than_tab, re.M | re.X)␊ |
888 | return wiki_table_re.sub(self._wiki_table_sub, text)␊ |
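
    # E.g. (a minimal sketch), the two-row wiki table
    #
    #   || *a* || b ||
    #   || c || d ||
    #
    # becomes a <table><tbody>...</tbody></table> with one <td> per cell,
    # each cell run through the span gamut (so *a* renders as <em>a</em>).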
889 | ␊ |
890 | def _run_span_gamut(self, text):␊ |
891 | # These are all the transformations that occur *within* block-level␊ |
892 | # tags like paragraphs, headers, and list items.␊ |
893 | ␊ |
894 | text = self._do_code_spans(text)␊ |
895 | ␊ |
896 | text = self._escape_special_chars(text)␊ |
897 | ␊ |
898 | # Process anchor and image tags.␊ |
899 | text = self._do_links(text)␊ |
900 | ␊ |
901 | # Make links out of things like `<http://example.com/>`␊ |
902 | # Must come after _do_links(), because you can use < and >␊ |
903 | # delimiters in inline links like [this](<url>).␊ |
904 | text = self._do_auto_links(text)␊ |
905 | ␊ |
906 | if "link-patterns" in self.extras:␊ |
907 | text = self._do_link_patterns(text)␊ |
908 | ␊ |
909 | text = self._encode_amps_and_angles(text)␊ |
910 | ␊ |
911 | text = self._do_italics_and_bold(text)␊ |
912 | ␊ |
913 | if "smarty-pants" in self.extras:␊ |
914 | text = self._do_smart_punctuation(text)␊ |
915 | ␊ |
916 | # Do hard breaks:␊ |
917 | text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)␊ |
918 | ␊ |
919 | return text␊ |
920 | ␊ |
921 | # "Sorta" because auto-links are identified as "tag" tokens.␊ |
922 | _sorta_html_tokenize_re = re.compile(r"""␊ |
923 | (␊ |
924 | # tag␊ |
925 | </?␊ |
926 | (?:\w+) # tag name␊ |
927 | (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes␊ |
928 | \s*/?>␊ |
929 | |␊ |
930 | # auto-link (e.g., <http://www.activestate.com/>)␊ |
931 | <\w+[^>]*>␊ |
932 | |␊ |
933 | <!--.*?--> # comment␊ |
934 | |␊ |
935 | <\?.*?\?> # processing instruction␊ |
936 | )␊ |
937 | """, re.X)␊ |
938 | ␊ |
939 | def _escape_special_chars(self, text):␊ |
940 | # Python markdown note: the HTML tokenization here differs from␊ |
941 | # that in Markdown.pl, hence the behaviour for subtle cases can␊ |
942 | # differ (I believe the tokenizer here does a better job because␊ |
943 | # it isn't susceptible to unmatched '<' and '>' in HTML tags).␊ |
944 | # Note, however, that '>' is not allowed in an auto-link URL␊ |
945 | # here.␊ |
946 | escaped = []␊ |
947 | is_html_markup = False␊ |
948 | for token in self._sorta_html_tokenize_re.split(text):␊ |
949 | if is_html_markup:␊ |
950 | # Within tags/HTML-comments/auto-links, encode * and _␊ |
951 | # so they don't conflict with their use in Markdown for␊ |
952 | # italics and strong. We're replacing each such␊ |
953 | # character with its corresponding MD5 checksum value;␊ |
954 | # this is likely overkill, but it should prevent us from␊ |
955 | # colliding with the escape values by accident.␊ |
956 | escaped.append(token.replace('*', self._escape_table['*'])␊ |
957 | .replace('_', self._escape_table['_']))␊ |
958 | else:␊ |
959 | escaped.append(self._encode_backslash_escapes(token))␊ |
960 | is_html_markup = not is_html_markup␊ |
961 | return ''.join(escaped)␊ |
962 | ␊ |
963 | def _hash_html_spans(self, text):␊ |
964 | # Used for safe_mode.␊ |
965 | ␊ |
966 | def _is_auto_link(s):␊ |
967 | if ':' in s and self._auto_link_re.match(s):␊ |
968 | return True␊ |
969 | elif '@' in s and self._auto_email_link_re.match(s):␊ |
970 | return True␊ |
971 | return False␊ |
972 | ␊ |
973 | tokens = []␊ |
974 | is_html_markup = False␊ |
975 | for token in self._sorta_html_tokenize_re.split(text):␊ |
976 | if is_html_markup and not _is_auto_link(token):␊ |
977 | sanitized = self._sanitize_html(token)␊ |
978 | key = _hash_text(sanitized)␊ |
979 | self.html_spans[key] = sanitized␊ |
980 | tokens.append(key)␊ |
981 | else:␊ |
982 | tokens.append(token)␊ |
983 | is_html_markup = not is_html_markup␊ |
984 | return ''.join(tokens)␊ |
985 | ␊ |
986 | def _unhash_html_spans(self, text):␊ |
987 | for key, sanitized in list(self.html_spans.items()):␊ |
988 | text = text.replace(key, sanitized)␊ |
989 | return text␊ |
990 | ␊ |
991 | def _sanitize_html(self, s):␊ |
992 | if self.safe_mode == "replace":␊ |
993 | return self.html_removed_text␊ |
994 | elif self.safe_mode == "escape":␊ |
995 | replacements = [␊ |
996 | ('&', '&'),␊ |
997 | ('<', '<'),␊ |
998 | ('>', '>'),␊ |
999 | ]␊ |
1000 | for before, after in replacements:␊ |
1001 | s = s.replace(before, after)␊ |
1002 | return s␊ |
1003 | else:␊ |
1004 | raise MarkdownError("invalid value for 'safe_mode': %r (must be "␊ |
1005 | "'escape' or 'replace')" % self.safe_mode)␊ |
1006 | ␊ |
1007 | _tail_of_inline_link_re = re.compile(r'''␊ |
1008 | # Match tail of: [text](/url/) or [text](/url/ "title")␊ |
1009 | \( # literal paren␊ |
1010 | [ \t]*␊ |
1011 | (?P<url> # \1␊ |
1012 | <.*?>␊ |
1013 | |␊ |
1014 | .*?␊ |
1015 | )␊ |
1016 | [ \t]*␊ |
1017 | ( # \2␊ |
1018 | (['"]) # quote char = \3␊ |
1019 | (?P<title>.*?)␊ |
1020 | \3 # matching quote␊ |
1021 | )? # title is optional␊ |
1022 | \)␊ |
1023 | ''', re.X | re.S)␊ |
1024 | _tail_of_reference_link_re = re.compile(r'''␊ |
1025 | # Match tail of: [text][id]␊ |
1026 | [ ]? # one optional space␊ |
1027 | (?:\n[ ]*)? # one optional newline followed by spaces␊ |
1028 | \[␊ |
1029 | (?P<id>.*?)␊ |
1030 | \]␊ |
1031 | ''', re.X | re.S)␊ |
1032 | ␊ |
1033 | def _do_links(self, text):␊ |
1034 | """Turn Markdown link shortcuts into XHTML <a> and <img> tags.␊ |
1035 | ␊ |
1036 | This is a combination of Markdown.pl's _DoAnchors() and␊ |
1037 | _DoImages(). They are done together because that simplified the␊ |
        approach. A different approach than Markdown.pl's was necessary
        because Python's regex engine lacks the atomic matching that
        Markdown.pl's $g_nested_brackets relies on.
1041 | """␊ |
1042 | MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24␊ |
1043 | ␊ |
1044 | # `anchor_allowed_pos` is used to support img links inside␊ |
1045 | # anchors, but not anchors inside anchors. An anchor's start␊ |
1046 | # pos must be `>= anchor_allowed_pos`.␊ |
1047 | anchor_allowed_pos = 0␊ |
1048 | ␊ |
1049 | curr_pos = 0␊ |
1050 | while True: # Handle the next link.␊ |
1051 | # The next '[' is the start of:␊ |
1052 | # - an inline anchor: [text](url "title")␊ |
1053 | # - a reference anchor: [text][id]␊ |
            # - an inline img:      ![text](url "title")
1055 | # - a reference img: ![text][id]␊ |
1056 | # - a footnote ref: [^id]␊ |
1057 | # (Only if 'footnotes' extra enabled)␊ |
1058 | # - a footnote defn: [^id]: ...␊ |
1059 | # (Only if 'footnotes' extra enabled) These have already␊ |
1060 | # been stripped in _strip_footnote_definitions() so no␊ |
1061 | # need to watch for them.␊ |
1062 | # - a link definition: [id]: url "title"␊ |
1063 | # These have already been stripped in␊ |
1064 | # _strip_link_definitions() so no need to watch for them.␊ |
1065 | # - not markup: [...anything else...␊ |
1066 | try:␊ |
1067 | start_idx = text.index('[', curr_pos)␊ |
1068 | except ValueError:␊ |
1069 | break␊ |
1070 | text_length = len(text)␊ |
1071 | ␊ |
1072 | # Find the matching closing ']'.␊ |
1073 | # Markdown.pl allows *matching* brackets in link text so we␊ |
1074 | # will here too. Markdown.pl *doesn't* currently allow␊ |
1075 | # matching brackets in img alt text -- we'll differ in that␊ |
1076 | # regard.␊ |
1077 | bracket_depth = 0␊ |
1078 | for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,␊ |
1079 | text_length)):␊ |
1080 | ch = text[p]␊ |
1081 | if ch == ']':␊ |
1082 | bracket_depth -= 1␊ |
1083 | if bracket_depth < 0:␊ |
1084 | break␊ |
1085 | elif ch == '[':␊ |
1086 | bracket_depth += 1␊ |
1087 | else:␊ |
1088 | # Closing bracket not found within sentinel length.␊ |
1089 | # This isn't markup.␊ |
1090 | curr_pos = start_idx + 1␊ |
1091 | continue␊ |
1092 | link_text = text[start_idx+1:p]␊ |
1093 | ␊ |
1094 | # Possibly a footnote ref?␊ |
1095 | if "footnotes" in self.extras and link_text.startswith("^"):␊ |
1096 | normed_id = re.sub(r'\W', '-', link_text[1:])␊ |
1097 | if normed_id in self.footnotes:␊ |
1098 | self.footnote_ids.append(normed_id)␊ |
1099 | result = '<sup class="footnote-ref" id="fnref-%s">' \␊ |
1100 | '<a href="#fn-%s">%s</a></sup>' \␊ |
1101 | % (normed_id, normed_id, len(self.footnote_ids))␊ |
1102 | text = text[:start_idx] + result + text[p+1:]␊ |
1103 | else:␊ |
1104 | # This id isn't defined, leave the markup alone.␊ |
1105 | curr_pos = p+1␊ |
1106 | continue␊ |
1107 | ␊ |
1108 | # Now determine what this is by the remainder.␊ |
1109 | p += 1␊ |
1110 | if p == text_length:␊ |
1111 | return text␊ |
1112 | ␊ |
1113 | # Inline anchor or img?␊ |
1114 | if text[p] == '(': # attempt at perf improvement␊ |
1115 | match = self._tail_of_inline_link_re.match(text, p)␊ |
1116 | if match:␊ |
1117 | # Handle an inline anchor or img.␊ |
1118 | is_img = start_idx > 0 and text[start_idx-1] == "!"␊ |
1119 | if is_img:␊ |
1120 | start_idx -= 1␊ |
1121 | ␊ |
                    # A '#' immediately before '![...](...)' marks the image
                    # as inline (rendered with class="inlineimage" below).
                    is_inline_img = start_idx > 0 and text[start_idx-1] == "#"
                    if is_inline_img:
                        start_idx -= 1
                        is_img = True
1126 | ␊ |
1127 | url, title = match.group("url"), match.group("title")␊ |
1128 | if url and url[0] == '<':␊ |
1129 | url = url[1:-1] # '<url>' -> 'url'␊ |
1130 | # We've got to encode these to avoid conflicting␊ |
1131 | # with italics/bold.␊ |
1132 | url = url.replace('*', self._escape_table['*']) \␊ |
1133 | .replace('_', self._escape_table['_'])␊ |
1134 | if title:␊ |
1135 | title_str = ' title="%s"' % (␊ |
1136 | _xml_escape_attr(title)␊ |
1137 | .replace('*', self._escape_table['*'])␊ |
1138 | .replace('_', self._escape_table['_']))␊ |
1139 | else:␊ |
1140 | title_str = ''␊ |
                    if is_img:
                        if is_inline_img:
                            result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \
                                % (url.replace('"', '&quot;'),
                                   _xml_escape_attr(link_text),
                                   title_str, self.empty_element_suffix)
                        else:
                            result = '<img src="%s" alt="%s"%s%s' \
                                % (url.replace('"', '&quot;'),
                                   _xml_escape_attr(link_text),
                                   title_str, self.empty_element_suffix)
1152 | if "smarty-pants" in self.extras:␊ |
1153 | result = result.replace('"', self._escape_table['"'])␊ |
1154 | curr_pos = start_idx + len(result)␊ |
1155 | text = text[:start_idx] + result + text[match.end():]␊ |
1156 | elif start_idx >= anchor_allowed_pos:␊ |
1157 | result_head = '<a href="%s"%s>' % (url, title_str)␊ |
1158 | result = '%s%s</a>' % (result_head, link_text)␊ |
1159 | if "smarty-pants" in self.extras:␊ |
1160 | result = result.replace('"', self._escape_table['"'])␊ |
1161 | # <img> allowed from curr_pos on, <a> from␊ |
1162 | # anchor_allowed_pos on.␊ |
1163 | curr_pos = start_idx + len(result_head)␊ |
1164 | anchor_allowed_pos = start_idx + len(result)␊ |
1165 | text = text[:start_idx] + result + text[match.end():]␊ |
1166 | else:␊ |
1167 | # Anchor not allowed here.␊ |
1168 | curr_pos = start_idx + 1␊ |
1169 | continue␊ |
1170 | ␊ |
1171 | # Reference anchor or img?␊ |
1172 | else:␊ |
1173 | match = self._tail_of_reference_link_re.match(text, p)␊ |
1174 | if match:␊ |
1175 | # Handle a reference-style anchor or img.␊ |
1176 | is_img = start_idx > 0 and text[start_idx-1] == "!"␊ |
1177 | if is_img:␊ |
1178 | start_idx -= 1␊ |
1179 | link_id = match.group("id").lower()␊ |
1180 | if not link_id:␊ |
1181 | link_id = link_text.lower() # for links like [this][]␊ |
1182 | if link_id in self.urls:␊ |
1183 | url = self.urls[link_id]␊ |
1184 | # We've got to encode these to avoid conflicting␊ |
1185 | # with italics/bold.␊ |
1186 | url = url.replace('*', self._escape_table['*']) \␊ |
1187 | .replace('_', self._escape_table['_'])␊ |
1188 | title = self.titles.get(link_id)␊ |
1189 | if title:␊ |
1191 | title = _xml_escape_attr(title) \␊ |
1192 | .replace('*', self._escape_table['*']) \␊ |
1193 | .replace('_', self._escape_table['_'])␊ |
1194 | title_str = ' title="%s"' % title␊ |
1195 | else:␊ |
1196 | title_str = ''␊ |
1197 | if is_img:␊ |
1198 | result = '<img src="%s" alt="%s"%s%s' \␊ |
1199 | % (url.replace('"', '"'),␊ |
1200 | link_text.replace('"', '"'),␊ |
1201 | title_str, self.empty_element_suffix)␊ |
1202 | if "smarty-pants" in self.extras:␊ |
1203 | result = result.replace('"', self._escape_table['"'])␊ |
1204 | curr_pos = start_idx + len(result)␊ |
1205 | text = text[:start_idx] + result + text[match.end():]␊ |
1206 | elif start_idx >= anchor_allowed_pos:␊ |
                            result_head = '<a href="%s"%s>' % (url, title_str)
                            result = '%s%s</a>' % (result_head, link_text)
1211 | if "smarty-pants" in self.extras:␊ |
1212 | result = result.replace('"', self._escape_table['"'])␊ |
1213 | # <img> allowed from curr_pos on, <a> from␊ |
1214 | # anchor_allowed_pos on.␊ |
1215 | curr_pos = start_idx + len(result_head)␊ |
1216 | anchor_allowed_pos = start_idx + len(result)␊ |
1217 | text = text[:start_idx] + result + text[match.end():]␊ |
1218 | else:␊ |
1219 | # Anchor not allowed here.␊ |
1220 | curr_pos = start_idx + 1␊ |
1221 | else:␊ |
1222 | # This id isn't defined, leave the markup alone.␊ |
1223 | curr_pos = match.end()␊ |
1224 | continue␊ |
1225 | ␊ |
1226 | # Otherwise, it isn't markup.␊ |
1227 | curr_pos = start_idx + 1␊ |
1228 | ␊ |
1229 | return text␊ |
1230 | ␊ |
1231 | def header_id_from_text(self, text, prefix, n):␊ |
1232 | """Generate a header id attribute value from the given header␊ |
1233 | HTML content.␊ |
1234 | ␊ |
1235 | This is only called if the "header-ids" extra is enabled.␊ |
1236 | Subclasses may override this for different header ids.␊ |
1237 | ␊ |
1238 | @param text {str} The text of the header tag␊ |
1239 | @param prefix {str} The requested prefix for header ids. This is the␊ |
1240 | value of the "header-ids" extra key, if any. Otherwise, None.␊ |
1241 | @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.␊ |
1242 | @returns {str} The value for the header tag's "id" attribute. Return␊ |
1243 | None to not have an id attribute and to exclude this header from␊ |
1244 | the TOC (if the "toc" extra is specified).␊ |
1245 | """␊ |
1246 | header_id = _slugify(text)␊ |
1247 | if prefix and isinstance(prefix, base_string_type):␊ |
1248 | header_id = prefix + '-' + header_id␊ |
1249 | if header_id in self._count_from_header_id:␊ |
1250 | self._count_from_header_id[header_id] += 1␊ |
1251 | header_id += '-%s' % self._count_from_header_id[header_id]␊ |
1252 | else:␊ |
1253 | self._count_from_header_id[header_id] = 1␊ |
1254 | return header_id␊ |
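
    # E.g. (values made up): "My Header" slugifies to "my-header"; with
    # extras={"header-ids": "ch1"} it becomes "ch1-my-header"; and a later
    # header with the same slug gets "-2", "-3", ... appended.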
1255 | ␊ |
1256 | _toc = None␊ |
1257 | def _toc_add_entry(self, level, id, name):␊ |
1258 | if self._toc is None:␊ |
1259 | self._toc = []␊ |
1260 | self._toc.append((level, id, self._unescape_special_chars(name)))␊ |
1261 | ␊ |
1262 | _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)␊ |
1263 | def _setext_h_sub(self, match):␊ |
1264 | n = {"=": 1, "-": 2}[match.group(2)[0]]␊ |
1265 | demote_headers = self.extras.get("demote-headers")␊ |
1266 | if demote_headers:␊ |
1267 | n = min(n + demote_headers, 6)␊ |
1268 | header_id_attr = ""␊ |
1269 | if "header-ids" in self.extras:␊ |
1270 | header_id = self.header_id_from_text(match.group(1),␊ |
1271 | self.extras["header-ids"], n)␊ |
1272 | if header_id:␊ |
1273 | header_id_attr = ' id="%s"' % header_id␊ |
1274 | html = self._run_span_gamut(match.group(1))␊ |
1275 | if "toc" in self.extras and header_id:␊ |
1276 | self._toc_add_entry(n, header_id, html)␊ |
1277 | return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)␊ |
1278 | ␊ |
1279 | _atx_h_re = re.compile(r'''␊ |
1280 | ^(\#{1,6}) # \1 = string of #'s␊ |
1281 | [ \t]+␊ |
1282 | (.+?) # \2 = Header text␊ |
1283 | [ \t]*␊ |
1284 | (?<!\\) # ensure not an escaped trailing '#'␊ |
1285 | \#* # optional closing #'s (not counted)␊ |
1286 | \n+␊ |
1287 | ''', re.X | re.M)␊ |
1288 | def _atx_h_sub(self, match):␊ |
1289 | n = len(match.group(1))␊ |
1290 | demote_headers = self.extras.get("demote-headers")␊ |
1291 | if demote_headers:␊ |
1292 | n = min(n + demote_headers, 6)␊ |
1293 | header_id_attr = ""␊ |
1294 | if "header-ids" in self.extras:␊ |
1295 | header_id = self.header_id_from_text(match.group(2),␊ |
1296 | self.extras["header-ids"], n)␊ |
1297 | if header_id:␊ |
1298 | header_id_attr = ' id="%s"' % header_id␊ |
1299 | html = self._run_span_gamut(match.group(2))␊ |
1300 | if "toc" in self.extras and header_id:␊ |
1301 | self._toc_add_entry(n, header_id, html)␊ |
1302 | return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)␊ |
1303 | ␊ |
1304 | def _do_headers(self, text):␊ |
1305 | # Setext-style headers:␊ |
1306 | # Header 1␊ |
1307 | # ========␊ |
1308 | #␊ |
1309 | # Header 2␊ |
1310 | # --------␊ |
1311 | text = self._setext_h_re.sub(self._setext_h_sub, text)␊ |
1312 | ␊ |
1313 | # atx-style headers:␊ |
1314 | # # Header 1␊ |
1315 | # ## Header 2␊ |
1316 | # ## Header 2 with closing hashes ##␊ |
1317 | # ...␊ |
1318 | # ###### Header 6␊ |
1319 | text = self._atx_h_re.sub(self._atx_h_sub, text)␊ |
1320 | ␊ |
1321 | return text␊ |
1322 | ␊ |
1323 | ␊ |
1324 | _marker_ul_chars = '*+-'␊ |
1325 | _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars␊ |
1326 | _marker_ul = '(?:[%s])' % _marker_ul_chars␊ |
1327 | _marker_ol = r'(?:\d+\.)'␊ |
1328 | ␊ |
1329 | def _list_sub(self, match):␊ |
1330 | lst = match.group(1)␊ |
        lst_type = "ul" if match.group(3) in self._marker_ul_chars else "ol"
1332 | result = self._process_list_items(lst)␊ |
1333 | if self.list_level:␊ |
1334 | return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)␊ |
1335 | else:␊ |
1336 | return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)␊ |
1337 | ␊ |
1338 | def _do_lists(self, text):␊ |
1339 | # Form HTML ordered (numbered) and unordered (bulleted) lists.␊ |
1340 | ␊ |
1341 | # Iterate over each *non-overlapping* list match.␊ |
1342 | pos = 0␊ |
1343 | while True:␊ |
1344 | # Find the *first* hit for either list style (ul or ol). We␊ |
1345 | # match ul and ol separately to avoid adjacent lists of different␊ |
1346 | # types running into each other (see issue #16).␊ |
1347 | hits = []␊ |
1348 | for marker_pat in (self._marker_ul, self._marker_ol):␊ |
1349 | less_than_tab = self.tab_width - 1␊ |
1350 | whole_list = r'''␊ |
1351 | ( # \1 = whole list␊ |
1352 | ( # \2␊ |
1353 | [ ]{0,%d}␊ |
1354 | (%s) # \3 = first list item marker␊ |
1355 | [ \t]+␊ |
1356 | (?!\ *\3\ ) # '- - - ...' isn't a list. See 'not_quite_a_list' test case.␊ |
1357 | )␊ |
1358 | (?:.+?)␊ |
1359 | ( # \4␊ |
1360 | \Z␊ |
1361 | |␊ |
1362 | \n{2,}␊ |
1363 | (?=\S)␊ |
1364 | (?! # Negative lookahead for another list item marker␊ |
1365 | [ \t]*␊ |
1366 | %s[ \t]+␊ |
1367 | )␊ |
1368 | )␊ |
1369 | )␊ |
1370 | ''' % (less_than_tab, marker_pat, marker_pat)␊ |
1371 | if self.list_level: # sub-list␊ |
1372 | list_re = re.compile("^"+whole_list, re.X | re.M | re.S)␊ |
1373 | else:␊ |
1374 | list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,␊ |
1375 | re.X | re.M | re.S)␊ |
1376 | match = list_re.search(text, pos)␊ |
1377 | if match:␊ |
1378 | hits.append((match.start(), match))␊ |
1379 | if not hits:␊ |
1380 | break␊ |
1381 | hits.sort()␊ |
1382 | match = hits[0][1]␊ |
1383 | start, end = match.span()␊ |
1384 | text = text[:start] + self._list_sub(match) + text[end:]␊ |
1385 | pos = end␊ |
1386 | ␊ |
1387 | return text␊ |
1388 | ␊ |
1389 | _list_item_re = re.compile(r'''␊ |
1390 | (\n)? # leading line = \1␊ |
1391 | (^[ \t]*) # leading whitespace = \2␊ |
1392 | (?P<marker>%s) [ \t]+ # list marker = \3␊ |
1393 | ((?:.+?) # list item text = \4␊ |
1394 | (\n{1,2})) # eols = \5␊ |
1395 | (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))␊ |
1396 | ''' % (_marker_any, _marker_any),␊ |
1397 | re.M | re.X | re.S)␊ |
1398 | ␊ |
1399 | _last_li_endswith_two_eols = False␊ |
1400 | def _list_item_sub(self, match):␊ |
1401 | item = match.group(4)␊ |
1402 | leading_line = match.group(1)␊ |
1403 | leading_space = match.group(2)␊ |
1404 | if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:␊ |
1405 | item = self._run_block_gamut(self._outdent(item))␊ |
1406 | else:␊ |
1407 | # Recursion for sub-lists:␊ |
1408 | item = self._do_lists(self._outdent(item))␊ |
1409 | if item.endswith('\n'):␊ |
1410 | item = item[:-1]␊ |
1411 | item = self._run_span_gamut(item)␊ |
1412 | self._last_li_endswith_two_eols = (len(match.group(5)) == 2)␊ |
1413 | return "<li>%s</li>\n" % item␊ |
1414 | ␊ |
1415 | def _process_list_items(self, list_str):␊ |
1416 | # Process the contents of a single ordered or unordered list,␊ |
1417 | # splitting it into individual list items.␊ |
1418 | ␊ |
        # `self.list_level` keeps track of when we're inside a list.
1420 | # Each time we enter a list, we increment it; when we leave a list,␊ |
1421 | # we decrement. If it's zero, we're not in a list anymore.␊ |
1422 | #␊ |
1423 | # We do this because when we're not inside a list, we want to treat␊ |
1424 | # something like this:␊ |
1425 | #␊ |
1426 | # I recommend upgrading to version␊ |
1427 | # 8. Oops, now this line is treated␊ |
1428 | # as a sub-list.␊ |
1429 | #␊ |
1430 | # As a single paragraph, despite the fact that the second line starts␊ |
1431 | # with a digit-period-space sequence.␊ |
1432 | #␊ |
1433 | # Whereas when we're inside a list (or sub-list), that line will be␊ |
1434 | # treated as the start of a sub-list. What a kludge, huh? This is␊ |
1435 | # an aspect of Markdown's syntax that's hard to parse perfectly␊ |
1436 | # without resorting to mind-reading. Perhaps the solution is to␊ |
1437 | # change the syntax rules such that sub-lists must start with a␊ |
1438 | # starting cardinal number; e.g. "1." or "a.".␊ |
1439 | self.list_level += 1␊ |
1440 | self._last_li_endswith_two_eols = False␊ |
1441 | list_str = list_str.rstrip('\n') + '\n'␊ |
1442 | list_str = self._list_item_re.sub(self._list_item_sub, list_str)␊ |
1443 | self.list_level -= 1␊ |
1444 | return list_str␊ |
1445 | ␊ |
1446 | def _get_pygments_lexer(self, lexer_name):␊ |
1447 | try:␊ |
1448 | from pygments import lexers, util␊ |
1449 | except ImportError:␊ |
1450 | return None␊ |
1451 | try:␊ |
1452 | return lexers.get_lexer_by_name(lexer_name)␊ |
1453 | except util.ClassNotFound:␊ |
1454 | return None␊ |
1455 | ␊ |
1456 | def _color_with_pygments(self, codeblock, lexer, **formatter_opts):␊ |
1457 | import pygments␊ |
1458 | import pygments.formatters␊ |
1459 | ␊ |
1460 | class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):␊ |
1461 | def _wrap_code(self, inner):␊ |
1462 | """A function for use in a Pygments Formatter which␊ |
1463 | wraps in <code> tags.␊ |
1464 | """␊ |
1465 | yield 0, "<code>"␊ |
1466 | for tup in inner:␊ |
1467 | yield tup␊ |
1468 | yield 0, "</code>"␊ |
1469 | ␊ |
1470 | def wrap(self, source, outfile):␊ |
1471 | """Return the source with a code, pre, and div."""␊ |
1472 | return self._wrap_div(self._wrap_pre(self._wrap_code(source)))␊ |
1473 | ␊ |
1474 | formatter_opts.setdefault("cssclass", "codehilite")␊ |
1475 | formatter = HtmlCodeFormatter(**formatter_opts)␊ |
1476 | return pygments.highlight(codeblock, lexer, formatter)␊ |
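
    # Given the wrapping order above (_wrap_code inside _wrap_pre inside
    # _wrap_div) and the default "codehilite" cssclass, highlighted blocks
    # come out roughly as (illustrative):
    #
    #     <div class="codehilite"><pre><code>...highlighted spans...</code></pre></div>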
1477 | ␊ |
1478 | def _code_block_sub(self, match, is_fenced_code_block=False):␊ |
1479 | lexer_name = None␊ |
1480 | ␊ |
1481 | if is_fenced_code_block:␊ |
1482 | lexer_name = match.group(1)␊ |
1483 | if lexer_name:␊ |
1484 | formatter_opts = self.extras['fenced-code-blocks'] or {}␊ |
1485 | codeblock = match.group(2)␊ |
1486 | codeblock = codeblock[:-1] # drop one trailing newline␊ |
1487 | else:␊ |
1488 | codeblock = match.group(1)␊ |
1489 | codeblock = self._outdent(codeblock)␊ |
1490 | codeblock = self._detab(codeblock)␊ |
1491 | codeblock = codeblock.lstrip('\n') # trim leading newlines␊ |
1492 | codeblock = codeblock.rstrip() # trim trailing whitespace␊ |
1493 | ␊ |
1494 | # Note: "code-color" extra is DEPRECATED.␊ |
1495 | if "code-color" in self.extras and codeblock.startswith(":::"):␊ |
1496 | lexer_name, rest = codeblock.split('\n', 1)␊ |
1497 | lexer_name = lexer_name[3:].strip()␊ |
1498 | codeblock = rest.lstrip("\n") # Remove lexer declaration line.␊ |
1499 | formatter_opts = self.extras['code-color'] or {}␊ |
1500 | ␊ |
1501 | if lexer_name:␊ |
1502 | lexer = self._get_pygments_lexer(lexer_name)␊ |
1503 | if lexer:␊ |
1504 | colored = self._color_with_pygments(codeblock, lexer,␊ |
1505 | **formatter_opts)␊ |
1506 | return "\n\n%s\n\n" % colored␊ |
1507 | ␊ |
1508 | codeblock = self._encode_code(codeblock)␊ |
1509 | pre_class_str = self._html_class_str_from_tag("pre")␊ |
1510 | code_class_str = self._html_class_str_from_tag("code")␊ |
1511 | return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (␊ |
1512 | pre_class_str, code_class_str, codeblock)␊ |
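
    # Illustrative use of the 'fenced-code-blocks' extra (a sketch; exact
    # output depends on whether pygments is importable):
    #
    #     >>> markdown("```python\nprint(42)\n```", extras=["fenced-code-blocks"])
    #
    # With pygments available the block is highlighted via
    # _color_with_pygments(); otherwise it falls back to a plain
    # <pre><code>...</code></pre> block.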
1513 | ␊ |
1514 | def _html_class_str_from_tag(self, tag):␊ |
1515 | """Get the appropriate ' class="..."' string (note the leading␊ |
1516 | space), if any, for the given tag.␊ |
1517 | """␊ |
1518 | if "html-classes" not in self.extras:␊ |
1519 | return ""␊ |
1520 | try:␊ |
1521 | html_classes_from_tag = self.extras["html-classes"]␊ |
1522 | except TypeError:␊ |
1523 | return ""␊ |
1524 | else:␊ |
1525 | if tag in html_classes_from_tag:␊ |
1526 | return ' class="%s"' % html_classes_from_tag[tag]␊ |
1527 | return ""␊ |
1528 | ␊ |
1529 | def _do_code_blocks(self, text):␊ |
1530 | """Process Markdown `<pre><code>` blocks."""␊ |
1531 | code_block_re = re.compile(r'''␊ |
1532 | (?:\n\n|\A\n?)␊ |
1533 | ( # $1 = the code block -- one or more lines, starting with a space/tab␊ |
1534 | (?:␊ |
1535 | (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces␊ |
1536 | .*\n+␊ |
1537 | )+␊ |
1538 | )␊ |
1539 | ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc␊ |
1540 | ''' % (self.tab_width, self.tab_width),␊ |
1541 | re.M | re.X)␊ |
1542 | return code_block_re.sub(self._code_block_sub, text)␊ |
1543 | ␊ |
1544 | _fenced_code_block_re = re.compile(r'''␊ |
1545 | (?:\n\n|\A\n?)␊ |
1546 | ^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang␊ |
1547 | (.*?) # $2 = code block content␊ |
1548 | ^```[ \t]*\n # closing fence␊ |
1549 | ''', re.M | re.X | re.S)␊ |
1550 | ␊ |
1551 | def _fenced_code_block_sub(self, match):␊ |
        return self._code_block_sub(match, is_fenced_code_block=True)
1553 | ␊ |
1554 | def _do_fenced_code_blocks(self, text):␊ |
1555 | """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""␊ |
1556 | return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)␊ |
1557 | ␊ |
1558 | # Rules for a code span:␊ |
1559 | # - backslash escapes are not interpreted in a code span␊ |
    # - to include a backtick, or a run of backticks, the delimiters must
    #   be a longer run of backticks
1562 | # - cannot start or end a code span with a backtick; pad with a␊ |
1563 | # space and that space will be removed in the emitted HTML␊ |
1564 | # See `test/tm-cases/escapes.text` for a number of edge-case␊ |
1565 | # examples.␊ |
1566 | _code_span_re = re.compile(r'''␊ |
1567 | (?<!\\)␊ |
1568 | (`+) # \1 = Opening run of `␊ |
1569 | (?!`) # See Note A test/tm-cases/escapes.text␊ |
1570 | (.+?) # \2 = The code block␊ |
1571 | (?<!`)␊ |
1572 | \1 # Matching closer␊ |
1573 | (?!`)␊ |
1574 | ''', re.X | re.S)␊ |
1575 | ␊ |
1576 | def _code_span_sub(self, match):␊ |
1577 | c = match.group(2).strip(" \t")␊ |
1578 | c = self._encode_code(c)␊ |
1579 | return "<code>%s</code>" % c␊ |
1580 | ␊ |
1581 | def _do_code_spans(self, text):␊ |
1582 | # * Backtick quotes are used for <code></code> spans.␊ |
1583 | #␊ |
1584 | # * You can use multiple backticks as the delimiters if you want to␊ |
1585 | # include literal backticks in the code span. So, this input:␊ |
1586 | #␊ |
1587 | # Just type ``foo `bar` baz`` at the prompt.␊ |
1588 | #␊ |
1589 | # Will translate to:␊ |
1590 | #␊ |
1591 | # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>␊ |
1592 | #␊ |
1593 | # There's no arbitrary limit to the number of backticks you␊ |
        #   can use as delimiters. If you need three consecutive backticks
1595 | # in your code, use four for delimiters, etc.␊ |
1596 | #␊ |
1597 | # * You can use spaces to get literal backticks at the edges:␊ |
1598 | #␊ |
1599 | # ... type `` `bar` `` ...␊ |
1600 | #␊ |
1601 | # Turns to:␊ |
1602 | #␊ |
1603 | # ... type <code>`bar`</code> ...␊ |
1604 | return self._code_span_re.sub(self._code_span_sub, text)␊ |
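
    # E.g., per the rules above (illustrative doctest):
    #
    #     >>> markdown("Just type ``foo `bar` baz`` at the prompt.")
    #     u'<p>Just type <code>foo `bar` baz</code> at the prompt.</p>\n'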
1605 | ␊ |
1606 | def _encode_code(self, text):␊ |
1607 | """Encode/escape certain characters inside Markdown code runs.␊ |
1608 | The point is that in code, these characters are literals,␊ |
1609 | and lose their special Markdown meanings.␊ |
1610 | """␊ |
1611 | replacements = [␊ |
1612 | # Encode all ampersands; HTML entities are not␊ |
1613 | # entities within a Markdown code span.␊ |
1614 | ('&', '&'),␊ |
1615 | # Do the angle bracket song and dance:␊ |
1616 | ('<', '<'),␊ |
1617 | ('>', '>'),␊ |
1618 | ]␊ |
1619 | for before, after in replacements:␊ |
1620 | text = text.replace(before, after)␊ |
1621 | hashed = _hash_text(text)␊ |
1622 | self._escape_table[text] = hashed␊ |
1623 | return hashed␊ |
1624 | ␊ |
    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
    _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
    _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
    # Note: despite their "_code_friendly" prefix, the strike-through (~~)
    # and underline (~) patterns apply in both modes below.
    _code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S)
    _code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S)
    def _do_italics_and_bold(self, text):
        # <strong> must go first:
        if "code-friendly" in self.extras:
            text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
            text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
        else:
            text = self._strong_re.sub(r"<strong>\2</strong>", text)
            text = self._em_re.sub(r"<em>\2</em>", text)
        # Strike-through and underline are independent of 'code-friendly':
        text = self._code_friendly_line_re.sub(r"<span style='text-decoration:line-through'>\1</span>", text)
        text = self._code_friendly_underline_re.sub(r"<span style='text-decoration:underline'>\1</span>", text)
        return text
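
    # Illustrative contrast for the 'code-friendly' extra (sketch):
    #
    #     >>> markdown("_foo_")                            # default
    #     u'<p><em>foo</em></p>\n'
    #     >>> markdown("_foo_", extras=["code-friendly"])  # _ left alone
    #     u'<p>_foo_</p>\n'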
1644 | ␊ |
1645 | # "smarty-pants" extra: Very liberal in interpreting a single prime as an␊ |
1646 | # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and␊ |
1647 | # "twixt" can be written without an initial apostrophe. This is fine because␊ |
1648 | # using scare quotes (single quotation marks) is rare.␊ |
1649 | _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")␊ |
1650 | _contractions = ["tis", "twas", "twer", "neath", "o", "n",␊ |
1651 | "round", "bout", "twixt", "nuff", "fraid", "sup"]␊ |
1652 | def _do_smart_contractions(self, text):␊ |
1653 | text = self._apostrophe_year_re.sub(r"’\1", text)␊ |
1654 | for c in self._contractions:␊ |
1655 | text = text.replace("'%s" % c, "’%s" % c)␊ |
1656 | text = text.replace("'%s" % c.capitalize(),␊ |
1657 | "’%s" % c.capitalize())␊ |
1658 | return text␊ |
1659 | ␊ |
1660 | # Substitute double-quotes before single-quotes.␊ |
1661 | _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")␊ |
1662 | _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')␊ |
1663 | _closing_single_quote_re = re.compile(r"(?<=\S)'")␊ |
1664 | _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')␊ |
1665 | def _do_smart_punctuation(self, text):␊ |
1666 | """Fancifies 'single quotes', "double quotes", and apostrophes.␊ |
1667 | Converts --, ---, and ... into en dashes, em dashes, and ellipses.␊ |
1668 | ␊ |
1669 | Inspiration is: <http://daringfireball.net/projects/smartypants/>␊ |
1670 | See "test/tm-cases/smarty_pants.text" for a full discussion of the␊ |
1671 | support here and␊ |
1672 | <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a␊ |
1673 | discussion of some diversion from the original SmartyPants.␊ |
1674 | """␊ |
1675 | if "'" in text: # guard for perf␊ |
1676 | text = self._do_smart_contractions(text)␊ |
1677 | text = self._opening_single_quote_re.sub("‘", text)␊ |
1678 | text = self._closing_single_quote_re.sub("’", text)␊ |
1679 | ␊ |
1680 | if '"' in text: # guard for perf␊ |
1681 | text = self._opening_double_quote_re.sub("“", text)␊ |
1682 | text = self._closing_double_quote_re.sub("”", text)␊ |
1683 | ␊ |
1684 | text = text.replace("---", "—")␊ |
1685 | text = text.replace("--", "–")␊ |
1686 | text = text.replace("...", "…")␊ |
1687 | text = text.replace(" . . . ", "…")␊ |
1688 | text = text.replace(". . .", "…")␊ |
1689 | return text␊ |
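
    # Illustrative (with the 'smarty-pants' extra enabled; shown informally
    # rather than as an exact doctest):
    #
    #     'tis --- "quoted" ...   ->   ’tis — “quoted” …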
1690 | ␊ |
1691 | _block_quote_re = re.compile(r'''␊ |
1692 | ( # Wrap whole match in \1␊ |
1693 | (␊ |
1694 | ^[ \t]*>[ \t]? # '>' at the start of a line␊ |
1695 | .+\n # rest of the first line␊ |
1696 | (.+\n)* # subsequent consecutive lines␊ |
1697 | \n* # blanks␊ |
1698 | )+␊ |
1699 | )␊ |
1700 | ''', re.M | re.X)␊ |
    _bq_one_level_re = re.compile(r'^[ \t]*>[ \t]?', re.M)
1702 | ␊ |
1703 | _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)␊ |
1704 | def _dedent_two_spaces_sub(self, match):␊ |
        return re.sub(r'(?m)^  ', '', match.group(1))
1706 | ␊ |
1707 | def _block_quote_sub(self, match):␊ |
1708 | bq = match.group(1)␊ |
1709 | bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting␊ |
1710 | bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines␊ |
1711 | bq = self._run_block_gamut(bq) # recurse␊ |
1712 | ␊ |
        bq = re.sub('(?m)^', '  ', bq)
1714 | # These leading spaces screw with <pre> content, so we need to fix that:␊ |
1715 | bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)␊ |
1716 | ␊ |
1717 | return "<blockquote>\n%s\n</blockquote>\n\n" % bq␊ |
1718 | ␊ |
1719 | def _do_block_quotes(self, text):␊ |
1720 | if '>' not in text:␊ |
1721 | return text␊ |
1722 | return self._block_quote_re.sub(self._block_quote_sub, text)␊ |
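
    # E.g. (illustrative):
    #
    #     >>> markdown("> quoted\n> text\n")
    #
    # strips one level of '>', re-runs the contents through the block gamut,
    # and emits them indented two spaces inside <blockquote>...</blockquote>.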
1723 | ␊ |
1724 | def _form_paragraphs(self, text):␊ |
1725 | # Strip leading and trailing lines:␊ |
1726 | text = text.strip('\n')␊ |
1727 | ␊ |
1728 | # Wrap <p> tags.␊ |
1729 | grafs = []␊ |
1730 | for i, graf in enumerate(re.split(r"\n{2,}", text)):␊ |
1731 | if graf in self.html_blocks:␊ |
1732 | # Unhashify HTML blocks␊ |
1733 | grafs.append(self.html_blocks[graf])␊ |
1734 | else:␊ |
1735 | cuddled_list = None␊ |
1736 | if "cuddled-lists" in self.extras:␊ |
1737 | # Need to put back trailing '\n' for `_list_item_re`␊ |
1738 | # match at the end of the paragraph.␊ |
1739 | li = self._list_item_re.search(graf + '\n')␊ |
1740 | # Two of the same list marker in this paragraph: a likely␊ |
1741 | # candidate for a list cuddled to preceding paragraph␊ |
1742 | # text (issue 33). Note the `[-1]` is a quick way to␊ |
1743 | # consider numeric bullets (e.g. "1." and "2.") to be␊ |
1744 | # equal.␊ |
1745 | if (li and len(li.group(2)) <= 3 and li.group("next_marker")␊ |
1746 | and li.group("marker")[-1] == li.group("next_marker")[-1]):␊ |
1747 | start = li.start()␊ |
1748 | cuddled_list = self._do_lists(graf[start:]).rstrip("\n")␊ |
1749 | assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")␊ |
1750 | graf = graf[:start]␊ |
1751 | ␊ |
1752 | # Wrap <p> tags.␊ |
1753 | graf = self._run_span_gamut(graf)␊ |
1754 | grafs.append("<p>" + graf.lstrip(" \t") + "</p>")␊ |
1755 | ␊ |
1756 | if cuddled_list:␊ |
1757 | grafs.append(cuddled_list)␊ |
1758 | ␊ |
1759 | return "\n\n".join(grafs)␊ |
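
    # Illustrative input for the 'cuddled-lists' code path above:
    #
    #     apple
    #     *   red fruit
    #     *   very tasty
    #
    # With extras=["cuddled-lists"], the paragraph text and the list are
    # split apart, yielding <p>apple</p> followed by a <ul>.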
1760 | ␊ |
1761 | def _add_footnotes(self, text):␊ |
1762 | if self.footnotes:␊ |
1763 | footer = [␊ |
1764 | '<div class="footnotes">',␊ |
1765 | '<hr' + self.empty_element_suffix,␊ |
1766 | '<ol>',␊ |
1767 | ]␊ |
1768 | for i, id in enumerate(self.footnote_ids):␊ |
1769 | if i != 0:␊ |
1770 | footer.append('')␊ |
1771 | footer.append('<li id="fn-%s">' % id)␊ |
1772 | footer.append(self._run_block_gamut(self.footnotes[id]))␊ |
1773 | backlink = ('<a href="#fnref-%s" '␊ |
1774 | 'class="footnoteBackLink" '␊ |
1775 | 'title="Jump back to footnote %d in the text.">'␊ |
1776 | '↩</a>' % (id, i+1))␊ |
1777 | if footer[-1].endswith("</p>"):␊ |
1778 | footer[-1] = footer[-1][:-len("</p>")] \␊ |
1779 | + ' ' + backlink + "</p>"␊ |
1780 | else:␊ |
1781 | footer.append("\n<p>%s</p>" % backlink)␊ |
1782 | footer.append('</li>')␊ |
1783 | footer.append('</ol>')␊ |
1784 | footer.append('</div>')␊ |
1785 | return text + '\n\n' + '\n'.join(footer)␊ |
1786 | else:␊ |
1787 | return text␊ |
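
    # Illustrative footnote round-trip (sketch):
    #
    #     >>> markdown("Hello[^note].\n\n[^note]: A footnote.", extras=["footnotes"])
    #
    # appends a <div class="footnotes"> block holding an <ol> of the
    # collected notes, each <li id="fn-..."> ending with a backlink to the
    # "#fnref-..." anchor in the text.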
1788 | ␊ |
1789 | # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:␊ |
1790 | # http://bumppo.net/projects/amputator/␊ |
1791 | _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')␊ |
1792 | _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)␊ |
1793 | _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)␊ |
1794 | ␊ |
1795 | def _encode_amps_and_angles(self, text):␊ |
1796 | # Smart processing for ampersands and angle brackets that need␊ |
1797 | # to be encoded.␊ |
1798 | text = self._ampersand_re.sub('&', text)␊ |
1799 | ␊ |
1800 | # Encode naked <'s␊ |
1801 | text = self._naked_lt_re.sub('<', text)␊ |
1802 | ␊ |
1803 | # Encode naked >'s␊ |
1804 | # Note: Other markdown implementations (e.g. Markdown.pl, PHP␊ |
1805 | # Markdown) don't do this.␊ |
1806 | text = self._naked_gt_re.sub('>', text)␊ |
1807 | return text␊ |
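
    # E.g. (illustrative): "AT&T" -> "AT&amp;T" and "1 < 2" -> "1 &lt; 2",
    # but an existing entity such as "&copy;" is left alone thanks to the
    # negative lookahead in _ampersand_re.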
1808 | ␊ |
1809 | def _encode_backslash_escapes(self, text):␊ |
1810 | for ch, escape in list(self._escape_table.items()):␊ |
1811 | text = text.replace("\\"+ch, escape)␊ |
1812 | return text␊ |
1813 | ␊ |
1814 | _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)␊ |
1815 | def _auto_link_sub(self, match):␊ |
1816 | g1 = match.group(1)␊ |
1817 | return '<a href="%s">%s</a>' % (g1, g1)␊ |
1818 | ␊ |
1819 | _auto_email_link_re = re.compile(r"""␊ |
1820 | <␊ |
1821 | (?:mailto:)?␊ |
1822 | (␊ |
1823 | [-.\w]+␊ |
1824 | \@␊ |
1825 | [-\w]+(\.[-\w]+)*\.[a-z]+␊ |
1826 | )␊ |
1827 | >␊ |
1828 | """, re.I | re.X | re.U)␊ |
1829 | def _auto_email_link_sub(self, match):␊ |
1830 | return self._encode_email_address(␊ |
1831 | self._unescape_special_chars(match.group(1)))␊ |
1832 | ␊ |
1833 | def _do_auto_links(self, text):␊ |
1834 | text = self._auto_link_re.sub(self._auto_link_sub, text)␊ |
1835 | text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)␊ |
1836 | return text␊ |
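
    # E.g. (illustrative): "<http://example.com>" becomes
    #     <a href="http://example.com">http://example.com</a>
    # while "<foo@example.com>" is routed through _encode_email_address()
    # below, so the address comes out entity-encoded.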
1837 | ␊ |
1838 | def _encode_email_address(self, addr):␊ |
1839 | # Input: an email address, e.g. "foo@example.com"␊ |
1840 | #␊ |
1841 | # Output: the email address as a mailto link, with each character␊ |
1842 | # of the address encoded as either a decimal or hex entity, in␊ |
1843 | # the hopes of foiling most address harvesting spam bots. E.g.:␊ |
1844 | #␊ |
1845 | # <a href="mailto:foo@e␊ |
1846 | # xample.com">foo␊ |
1847 | # @example.com</a>␊ |
1848 | #␊ |
1849 | # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk␊ |
1850 | # mailing list: <http://tinyurl.com/yu7ue>␊ |
1851 | chars = [_xml_encode_email_char_at_random(ch)␊ |
1852 | for ch in "mailto:" + addr]␊ |
1853 | # Strip the mailto: from the visible part.␊ |
1854 | addr = '<a href="%s">%s</a>' \␊ |
1855 | % (''.join(chars), ''.join(chars[7:]))␊ |
1856 | return addr␊ |
1857 | ␊ |
1858 | def _do_link_patterns(self, text):␊ |
1859 | """Caveat emptor: there isn't much guarding against link␊ |
1860 | patterns being formed inside other standard Markdown links, e.g.␊ |
1861 | inside a [link def][like this].␊ |
1862 | ␊ |
1863 | Dev Notes: *Could* consider prefixing regexes with a negative␊ |
1864 | lookbehind assertion to attempt to guard against this.␊ |
1865 | """␊ |
1866 | link_from_hash = {}␊ |
1867 | for regex, repl in self.link_patterns:␊ |
1868 | replacements = []␊ |
1869 | for match in regex.finditer(text):␊ |
1870 | if hasattr(repl, "__call__"):␊ |
1871 | href = repl(match)␊ |
1872 | else:␊ |
1873 | href = match.expand(repl)␊ |
1874 | replacements.append((match.span(), href))␊ |
1875 | for (start, end), href in reversed(replacements):␊ |
1876 | escaped_href = (␊ |
1877 | href.replace('"', '"') # b/c of attr quote␊ |
1878 | # To avoid markdown <em> and <strong>:␊ |
1879 | .replace('*', self._escape_table['*'])␊ |
1880 | .replace('_', self._escape_table['_']))␊ |
1881 | link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])␊ |
1882 | hash = _hash_text(link)␊ |
1883 | link_from_hash[hash] = link␊ |
1884 | text = text[:start] + hash + text[end:]␊ |
1885 | for hash, link in list(link_from_hash.items()):␊ |
1886 | text = text.replace(hash, link)␊ |
1887 | return text␊ |
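
    # Illustrative 'link-patterns' usage (the pattern and URL here are
    # hypothetical examples, not part of the module):
    #
    #     link_patterns = [
    #         (re.compile(r'\bissue #?(\d+)\b', re.I),
    #          r'https://example.com/issues/\1'),
    #     ]
    #     markdown("fixed in issue 42", extras=["link-patterns"],
    #              link_patterns=link_patterns)
    #
    # Each regex hit becomes <a href="...">matched text</a>, with the link
    # hashed until span processing is done so it can't be mangled.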
1888 | ␊ |
1889 | def _unescape_special_chars(self, text):␊ |
1890 | # Swap back in all the special characters we've hidden.␊ |
1891 | for ch, hash in list(self._escape_table.items()):␊ |
1892 | text = text.replace(hash, ch)␊ |
1893 | return text␊ |
1894 | ␊ |
1895 | def _outdent(self, text):␊ |
1896 | # Remove one level of line-leading tabs or spaces␊ |
1897 | return self._outdent_re.sub('', text)␊ |
1898 | ␊ |
1899 | ␊ |
1900 | class MarkdownWithExtras(Markdown):␊ |
1901 | """A markdowner class that enables most extras:␊ |
1902 | ␊ |
1903 | - footnotes␊ |
    - code-color (only has an effect if the 'pygments' Python module is on
      the path)
1905 | ␊ |
1906 | These are not included:␊ |
1907 | - pyshell (specific to Python-related documenting)␊ |
1908 | - code-friendly (because it *disables* part of the syntax)␊ |
1909 | - link-patterns (because you need to specify some actual␊ |
1910 | link-patterns anyway)␊ |
1911 | """␊ |
1912 | extras = ["footnotes", "code-color"]␊ |
1913 | ␊ |
1914 | ␊ |
1915 | #---- internal support functions␊ |
1916 | ␊ |
1917 | class UnicodeWithAttrs(unicode):␊ |
1918 | """A subclass of unicode used for the return value of conversion to␊ |
1919 | possibly attach some attributes. E.g. the "toc_html" attribute when␊ |
1920 | the "toc" extra is used.␊ |
1921 | """␊ |
1922 | metadata = None␊ |
1923 | _toc = None␊ |
1924 | def toc_html(self):␊ |
1925 | """Return the HTML for the current TOC.␊ |
1926 | ␊ |
1927 | This expects the `_toc` attribute to have been set on this instance.␊ |
1928 | """␊ |
1929 | if self._toc is None:␊ |
1930 | return None␊ |
1931 | ␊ |
1932 | def indent():␊ |
            return '  ' * (len(h_stack) - 1)
1934 | lines = []␊ |
1935 | h_stack = [0] # stack of header-level numbers␊ |
1936 | for level, id, name in self._toc:␊ |
1937 | if level > h_stack[-1]:␊ |
1938 | lines.append("%s<ul>" % indent())␊ |
1939 | h_stack.append(level)␊ |
1940 | elif level == h_stack[-1]:␊ |
1941 | lines[-1] += "</li>"␊ |
1942 | else:␊ |
1943 | while level < h_stack[-1]:␊ |
1944 | h_stack.pop()␊ |
1945 | if not lines[-1].endswith("</li>"):␊ |
1946 | lines[-1] += "</li>"␊ |
1947 | lines.append("%s</ul></li>" % indent())␊ |
1948 | lines.append('%s<li><a href="#%s">%s</a>' % (␊ |
1949 | indent(), id, name))␊ |
1950 | while len(h_stack) > 1:␊ |
1951 | h_stack.pop()␊ |
1952 | if not lines[-1].endswith("</li>"):␊ |
1953 | lines[-1] += "</li>"␊ |
1954 | lines.append("%s</ul>" % indent())␊ |
1955 | return '\n'.join(lines) + '\n'␊ |
1956 | toc_html = property(toc_html)␊ |
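
    # Illustrative TOC rendering (sketch): with
    #
    #     _toc = [(1, "intro", "Intro"), (1, "usage", "Usage")]
    #
    # toc_html produces roughly:
    #
    #     <ul>
    #       <li><a href="#intro">Intro</a></li>
    #       <li><a href="#usage">Usage</a></li>
    #     </ul>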
1957 | ␊ |
1958 | ## {{{ http://code.activestate.com/recipes/577257/ (r1)␊ |
1959 | _slugify_strip_re = re.compile(r'[^\w\s-]')␊ |
1960 | _slugify_hyphenate_re = re.compile(r'[-\s]+')␊ |
1961 | def _slugify(value):␊ |
1962 | """␊ |
1963 | Normalizes string, converts to lowercase, removes non-alpha characters,␊ |
1964 | and converts spaces to hyphens.␊ |
1965 | ␊ |
1966 | From Django's "django/template/defaultfilters.py".␊ |
1967 | """␊ |
1968 | import unicodedata␊ |
1969 | value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()␊ |
1970 | value = _slugify_strip_re.sub('', value).strip().lower()␊ |
1971 | return _slugify_hyphenate_re.sub('-', value)␊ |
1972 | ## end of http://code.activestate.com/recipes/577257/ }}}␊ |
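
# E.g. (illustrative):
#
#     >>> _slugify(u"Hello, World!")
#     u'hello-world'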
1973 | ␊ |
1974 | ␊ |
1975 | # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549␊ |
1976 | def _curry(*args, **kwargs):␊ |
1977 | function, args = args[0], args[1:]␊ |
1978 | def result(*rest, **kwrest):␊ |
1979 | combined = kwargs.copy()␊ |
1980 | combined.update(kwrest)␊ |
1981 | return function(*args + rest, **combined)␊ |
1982 | return result␊ |
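
# E.g. (illustrative): pre-binding the first argument of a two-arg function:
#
#     >>> add = lambda a, b: a + b
#     >>> add3 = _curry(add, 3)
#     >>> add3(4)
#     7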
1983 | ␊ |
1984 | # Recipe: regex_from_encoded_pattern (1.0)␊ |
1985 | def _regex_from_encoded_pattern(s):␊ |
1986 | """'foo' -> re.compile(re.escape('foo'))␊ |
1987 | '/foo/' -> re.compile('foo')␊ |
1988 | '/foo/i' -> re.compile('foo', re.I)␊ |
1989 | """␊ |
1990 | if s.startswith('/') and s.rfind('/') != 0:␊ |
1991 | # Parse it: /PATTERN/FLAGS␊ |
1992 | idx = s.rfind('/')␊ |
1993 | pattern, flags_str = s[1:idx], s[idx+1:]␊ |
1994 | flag_from_char = {␊ |
1995 | "i": re.IGNORECASE,␊ |
1996 | "l": re.LOCALE,␊ |
1997 | "s": re.DOTALL,␊ |
1998 | "m": re.MULTILINE,␊ |
1999 | "u": re.UNICODE,␊ |
2000 | }␊ |
2001 | flags = 0␊ |
2002 | for char in flags_str:␊ |
2003 | try:␊ |
2004 | flags |= flag_from_char[char]␊ |
2005 | except KeyError:␊ |
2006 | raise ValueError("unsupported regex flag: '%s' in '%s' "␊ |
2007 | "(must be one of '%s')"␊ |
2008 | % (char, s, ''.join(list(flag_from_char.keys()))))␊ |
        return re.compile(pattern, flags)
2010 | else: # not an encoded regex␊ |
2011 | return re.compile(re.escape(s))␊ |
2012 | ␊ |
2013 | # Recipe: dedent (0.1.2)␊ |
2014 | def _dedentlines(lines, tabsize=8, skip_first_line=False):␊ |
2015 | """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines␊ |
2016 | ␊ |
2017 | "lines" is a list of lines to dedent.␊ |
2018 | "tabsize" is the tab width to use for indent width calculations.␊ |
2019 | "skip_first_line" is a boolean indicating if the first line should␊ |
2020 | be skipped for calculating the indent width and for dedenting.␊ |
2021 | This is sometimes useful for docstrings and similar.␊ |
2022 | ␊ |
2023 | Same as dedent() except operates on a sequence of lines. Note: the␊ |
2024 | lines list is modified **in-place**.␊ |
2025 | """␊ |
2026 | DEBUG = False␊ |
2027 | if DEBUG:␊ |
2028 | print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\␊ |
2029 | % (tabsize, skip_first_line))␊ |
2030 | indents = []␊ |
2031 | margin = None␊ |
2032 | for i, line in enumerate(lines):␊ |
2033 | if i == 0 and skip_first_line: continue␊ |
2034 | indent = 0␊ |
2035 | for ch in line:␊ |
2036 | if ch == ' ':␊ |
2037 | indent += 1␊ |
2038 | elif ch == '\t':␊ |
2039 | indent += tabsize - (indent % tabsize)␊ |
2040 | elif ch in '\r\n':␊ |
2041 | continue # skip all-whitespace lines␊ |
2042 | else:␊ |
2043 | break␊ |
2044 | else:␊ |
2045 | continue # skip all-whitespace lines␊ |
2046 | if DEBUG: print("dedent: indent=%d: %r" % (indent, line))␊ |
2047 | if margin is None:␊ |
2048 | margin = indent␊ |
2049 | else:␊ |
2050 | margin = min(margin, indent)␊ |
2051 | if DEBUG: print("dedent: margin=%r" % margin)␊ |
2052 | ␊ |
2053 | if margin is not None and margin > 0:␊ |
2054 | for i, line in enumerate(lines):␊ |
2055 | if i == 0 and skip_first_line: continue␊ |
2056 | removed = 0␊ |
2057 | for j, ch in enumerate(line):␊ |
2058 | if ch == ' ':␊ |
2059 | removed += 1␊ |
2060 | elif ch == '\t':␊ |
2061 | removed += tabsize - (removed % tabsize)␊ |
2062 | elif ch in '\r\n':␊ |
2063 | if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)␊ |
2064 | lines[i] = lines[i][j:]␊ |
2065 | break␊ |
2066 | else:␊ |
2067 | raise ValueError("unexpected non-whitespace char %r in "␊ |
2068 | "line %r while removing %d-space margin"␊ |
2069 | % (ch, line, margin))␊ |
2070 | if DEBUG:␊ |
2071 | print("dedent: %r: %r -> removed %d/%d"\␊ |
2072 | % (line, ch, removed, margin))␊ |
2073 | if removed == margin:␊ |
2074 | lines[i] = lines[i][j+1:]␊ |
2075 | break␊ |
2076 | elif removed > margin:␊ |
2077 | lines[i] = ' '*(removed-margin) + lines[i][j+1:]␊ |
2078 | break␊ |
2079 | else:␊ |
2080 | if removed:␊ |
2081 | lines[i] = lines[i][removed:]␊ |
2082 | return lines␊ |
2083 | ␊ |
2084 | def _dedent(text, tabsize=8, skip_first_line=False):␊ |
2085 | """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text␊ |
2086 | ␊ |
2087 | "text" is the text to dedent.␊ |
2088 | "tabsize" is the tab width to use for indent width calculations.␊ |
2089 | "skip_first_line" is a boolean indicating if the first line should␊ |
2090 | be skipped for calculating the indent width and for dedenting.␊ |
2091 | This is sometimes useful for docstrings and similar.␊ |
2092 | ␊ |
2093 | textwrap.dedent(s), but don't expand tabs to spaces␊ |
2094 | """␊ |
    lines = text.splitlines(True)
2096 | _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)␊ |
2097 | return ''.join(lines)␊ |
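
# E.g. (illustrative): the common two-space margin is removed while deeper
# indentation is preserved:
#
#     >>> _dedent("  one\n    two\n")
#     'one\n  two\n'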
2098 | ␊ |
2099 | ␊ |
2100 | class _memoized(object):␊ |
2101 | """Decorator that caches a function's return value each time it is called.␊ |
2102 | If called later with the same arguments, the cached value is returned, and␊ |
2103 | not re-evaluated.␊ |
2104 | ␊ |
2105 | http://wiki.python.org/moin/PythonDecoratorLibrary␊ |
2106 | """␊ |
2107 | def __init__(self, func):␊ |
2108 | self.func = func␊ |
2109 | self.cache = {}␊ |
2110 | def __call__(self, *args):␊ |
2111 | try:␊ |
2112 | return self.cache[args]␊ |
2113 | except KeyError:␊ |
2114 | self.cache[args] = value = self.func(*args)␊ |
2115 | return value␊ |
2116 | except TypeError:␊ |
2117 | # uncachable -- for instance, passing a list as an argument.␊ |
2118 | # Better to not cache than to blow up entirely.␊ |
2119 | return self.func(*args)␊ |
2120 | def __repr__(self):␊ |
2121 | """Return the function's docstring."""␊ |
2122 | return self.func.__doc__␊ |
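
    # E.g. (illustrative):
    #
    #     @_memoized
    #     def fib(n):
    #         return n if n < 2 else fib(n - 1) + fib(n - 2)
    #
    # Repeated calls with the same (hashable) arguments hit the cache;
    # unhashable arguments fall back to a plain uncached call.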
2123 | ␊ |
2124 | ␊ |
2125 | def _xml_oneliner_re_from_tab_width(tab_width):␊ |
2126 | """Standalone XML processing instruction regex."""␊ |
2127 | return re.compile(r"""␊ |
2128 | (?:␊ |
2129 | (?<=\n\n) # Starting after a blank line␊ |
2130 | | # or␊ |
2131 | \A\n? # the beginning of the doc␊ |
2132 | )␊ |
2133 | ( # save in $1␊ |
2134 | [ ]{0,%d}␊ |
2135 | (?:␊ |
2136 | <\?\w+\b\s+.*?\?> # XML processing instruction␊ |
2137 | |␊ |
2138 | <\w+:\w+\b\s+.*?/> # namespaced single tag␊ |
2139 | )␊ |
2140 | [ \t]*␊ |
2141 | (?=\n{2,}|\Z) # followed by a blank line or end of document␊ |
2142 | )␊ |
2143 | """ % (tab_width - 1), re.X)␊ |
2144 | _xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)␊ |
2145 | ␊ |
2146 | def _hr_tag_re_from_tab_width(tab_width):␊ |
2147 | return re.compile(r"""␊ |
2148 | (?:␊ |
2149 | (?<=\n\n) # Starting after a blank line␊ |
2150 | | # or␊ |
2151 | \A\n? # the beginning of the doc␊ |
2152 | )␊ |
2153 | ( # save in \1␊ |
2154 | [ ]{0,%d}␊ |
2155 | <(hr) # start tag = \2␊ |
2156 | \b # word break␊ |
2157 | ([^<>])*? #␊ |
2158 | /?> # the matching end tag␊ |
2159 | [ \t]*␊ |
2160 | (?=\n{2,}|\Z) # followed by a blank line or end of document␊ |
2161 | )␊ |
2162 | """ % (tab_width - 1), re.X)␊ |
2163 | _hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)␊ |
2164 | ␊ |
2165 | ␊ |
2166 | def _xml_escape_attr(attr, skip_single_quote=True):␊ |
2167 | """Escape the given string for use in an HTML/XML tag attribute.␊ |
2168 | ␊ |
2169 | By default this doesn't bother with escaping `'` to `'`, presuming that␊ |
2170 | the tag attribute is surrounded by double quotes.␊ |
2171 | """␊ |
2172 | escaped = (attr␊ |
2173 | .replace('&', '&')␊ |
2174 | .replace('"', '"')␊ |
2175 | .replace('<', '<')␊ |
2176 | .replace('>', '>'))␊ |
2177 | if not skip_single_quote:␊ |
2178 | escaped = escaped.replace("'", "'")␊ |
2179 | return escaped␊ |
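
# E.g. (illustrative):
#
#     >>> _xml_escape_attr('rock & roll "quoted"')
#     'rock &amp; roll &quot;quoted&quot;'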
2180 | ␊ |
2181 | ␊ |
2182 | def _xml_encode_email_char_at_random(ch):␊ |
2183 | r = random()␊ |
2184 | # Roughly 10% raw, 45% hex, 45% dec.␊ |
2185 | # '@' *must* be encoded. I [John Gruber] insist.␊ |
2186 | # Issue 26: '_' must be encoded.␊ |
2187 | if r > 0.9 and ch not in "@_":␊ |
2188 | return ch␊ |
2189 | elif r < 0.45:␊ |
2190 | # The [1:] is to drop leading '0': 0x63 -> x63␊ |
2191 | return '&#%s;' % hex(ord(ch))[1:]␊ |
2192 | else:␊ |
2193 | return '&#%s;' % ord(ch)␊ |
2194 | ␊ |
2195 | ␊ |
2196 | ␊ |
2197 | #---- mainline␊ |
2198 | ␊ |
2199 | class _NoReflowFormatter(optparse.IndentedHelpFormatter):␊ |
2200 | """An optparse formatter that does NOT reflow the description."""␊ |
2201 | def format_description(self, description):␊ |
2202 | return description or ""␊ |
2203 | ␊ |
2204 | def _test():␊ |
2205 | import doctest␊ |
2206 | doctest.testmod()␊ |
2207 | ␊ |
2208 | def main(argv=None):␊ |
2209 | if argv is None:␊ |
2210 | argv = sys.argv␊ |
2211 | if not logging.root.handlers:␊ |
2212 | logging.basicConfig()␊ |
2213 | ␊ |
2214 | usage = "usage: %prog [PATHS...]"␊ |
2215 | version = "%prog "+__version__␊ |
2216 | parser = optparse.OptionParser(prog="markdown2", usage=usage,␊ |
2217 | version=version, description=cmdln_desc,␊ |
2218 | formatter=_NoReflowFormatter())␊ |
2219 | parser.add_option("-v", "--verbose", dest="log_level",␊ |
2220 | action="store_const", const=logging.DEBUG,␊ |
2221 | help="more verbose output")␊ |
2222 | parser.add_option("--encoding",␊ |
2223 | help="specify encoding of text content")␊ |
2224 | parser.add_option("--html4tags", action="store_true", default=False,␊ |
2225 | help="use HTML 4 style for empty element tags")␊ |
2226 | parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",␊ |
2227 | help="sanitize literal HTML: 'escape' escapes "␊ |
2228 | "HTML meta chars, 'replace' replaces with an "␊ |
2229 | "[HTML_REMOVED] note")␊ |
2230 | parser.add_option("-x", "--extras", action="append",␊ |
2231 | help="Turn on specific extra features (not part of "␊ |
2232 | "the core Markdown spec). See above.")␊ |
2233 | parser.add_option("--use-file-vars",␊ |
2234 | help="Look for and use Emacs-style 'markdown-extras' "␊ |
2235 | "file var to turn on extras. See "␊ |
2236 | "<https://github.com/trentm/python-markdown2/wiki/Extras>")␊ |
2237 | parser.add_option("--link-patterns-file",␊ |
2238 | help="path to a link pattern file")␊ |
2239 | parser.add_option("--self-test", action="store_true",␊ |
2240 | help="run internal self-tests (some doctests)")␊ |
2241 | parser.add_option("--compare", action="store_true",␊ |
2242 | help="run against Markdown.pl as well (for testing)")␊ |
2243 | parser.set_defaults(log_level=logging.INFO, compare=False,␊ |
2244 | encoding="utf-8", safe_mode=None, use_file_vars=False)␊ |
    opts, paths = parser.parse_args(argv[1:])  # honor the `argv` parameter
2246 | log.setLevel(opts.log_level)␊ |
2247 | ␊ |
2248 | if opts.self_test:␊ |
2249 | return _test()␊ |
2250 | ␊ |
2251 | if opts.extras:␊ |
2252 | extras = {}␊ |
2253 | for s in opts.extras:␊ |
2254 | splitter = re.compile("[,;: ]+")␊ |
2255 | for e in splitter.split(s):␊ |
2256 | if '=' in e:␊ |
2257 | ename, earg = e.split('=', 1)␊ |
2258 | try:␊ |
2259 | earg = int(earg)␊ |
2260 | except ValueError:␊ |
2261 | pass␊ |
2262 | else:␊ |
2263 | ename, earg = e, None␊ |
2264 | extras[ename] = earg␊ |
2265 | else:␊ |
2266 | extras = None␊ |
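
    # Illustrative -x values for the parsing above: "-x footnotes,code-friendly"
    # turns on two extras; a name=value form such as "-x demote-headers=2"
    # stores the (int-coerced) value as the extra's argument.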
2267 | ␊ |
2268 | if opts.link_patterns_file:␊ |
2269 | link_patterns = []␊ |
2270 | f = open(opts.link_patterns_file)␊ |
2271 | try:␊ |
2272 | for i, line in enumerate(f.readlines()):␊ |
2273 | if not line.strip(): continue␊ |
2274 | if line.lstrip().startswith("#"): continue␊ |
2275 | try:␊ |
2276 | pat, href = line.rstrip().rsplit(None, 1)␊ |
2277 | except ValueError:␊ |
2278 | raise MarkdownError("%s:%d: invalid link pattern line: %r"␊ |
2279 | % (opts.link_patterns_file, i+1, line))␊ |
2280 | link_patterns.append(␊ |
2281 | (_regex_from_encoded_pattern(pat), href))␊ |
2282 | finally:␊ |
2283 | f.close()␊ |
2284 | else:␊ |
2285 | link_patterns = None␊ |
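
    # A link-patterns file (illustrative format, per the parsing above): one
    # "PATTERN HREF" pair per line, blank lines and "#" comments skipped; the
    # pattern may be a plain string or "/regex/flags". A hypothetical line:
    #
    #     /issue\s+#?(\d+)/i https://example.com/issues/\1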
2286 | ␊ |
2287 | from os.path import join, dirname, abspath, exists␊ |
2288 | markdown_pl = join(dirname(dirname(abspath(__file__))), "test",␊ |
2289 | "Markdown.pl")␊ |
2290 | if not paths:␊ |
2291 | paths = ['-']␊ |
2292 | for path in paths:␊ |
2293 | if path == '-':␊ |
2294 | text = sys.stdin.read()␊ |
2295 | else:␊ |
2296 | fp = codecs.open(path, 'r', opts.encoding)␊ |
2297 | text = fp.read()␊ |
2298 | fp.close()␊ |
2299 | if opts.compare:␊ |
2300 | from subprocess import Popen, PIPE␊ |
2301 | print("==== Markdown.pl ====")␊ |
2302 | p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)␊ |
2303 | p.stdin.write(text.encode('utf-8'))␊ |
2304 | p.stdin.close()␊ |
2305 | perl_html = p.stdout.read().decode('utf-8')␊ |
2306 | if py3:␊ |
2307 | sys.stdout.write(perl_html)␊ |
2308 | else:␊ |
2309 | sys.stdout.write(perl_html.encode(␊ |
2310 | sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))␊ |
2311 | print("==== markdown2.py ====")␊ |
2312 | html = markdown(text,␊ |
2313 | html4tags=opts.html4tags,␊ |
2314 | safe_mode=opts.safe_mode,␊ |
2315 | extras=extras, link_patterns=link_patterns,␊ |
2316 | use_file_vars=opts.use_file_vars)␊ |
2317 | if py3:␊ |
2318 | sys.stdout.write(html)␊ |
2319 | else:␊ |
2320 | sys.stdout.write(html.encode(␊ |
2321 | sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))␊ |
        if extras and "toc" in extras:
            if py3:
                log.debug("toc_html: " + html.toc_html)
            else:
                log.debug("toc_html: " + html.toc_html.encode(
                    sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
2325 | if opts.compare:␊ |
2326 | test_dir = join(dirname(dirname(abspath(__file__))), "test")␊ |
2327 | if exists(join(test_dir, "test_markdown2.py")):␊ |
2328 | sys.path.insert(0, test_dir)␊ |
2329 | from test_markdown2 import norm_html_from_html␊ |
2330 | norm_html = norm_html_from_html(html)␊ |
2331 | norm_perl_html = norm_html_from_html(perl_html)␊ |
2332 | else:␊ |
2333 | norm_html = html␊ |
2334 | norm_perl_html = perl_html␊ |
2335 | print("==== match? %r ====" % (norm_perl_html == norm_html))␊ |
2336 | ␊ |
2337 | ␊ |
2338 | if __name__ == "__main__":␊ |
2339 | sys.exit( main(sys.argv) )␊ |