Dénote

Dénote Git Source Tree

Root/denote/markdown2.py

1# -*- coding: utf-8 -*-
2#!/usr/bin/env python
3# Copyright (c) 2012 Trent Mick.
4# Copyright (c) 2007-2008 ActiveState Corp.
5# License: MIT (http://www.opensource.org/licenses/mit-license.php)
6
7from __future__ import generators
8
9r"""A fast and complete Python implementation of Markdown.
10
11[from http://daringfireball.net/projects/markdown/]
12> Markdown is a text-to-HTML filter; it translates an easy-to-read /
13> easy-to-write structured text format into HTML. Markdown's text
14> format is most similar to that of plain text email, and supports
15> features such as headers, *emphasis*, code blocks, blockquotes, and
16> links.
17>
18> Markdown's syntax is designed not as a generic markup language, but
19> specifically to serve as a front-end to (X)HTML. You can use span-level
20> HTML tags anywhere in a Markdown document, and you can use block level
21> HTML tags (like <div> and <table> as well).
22
23Module usage:
24
25 >>> import markdown2
26 >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
27 u'<p><em>boo!</em></p>\n'
28
29 >>> markdowner = Markdown()
30 >>> markdowner.convert("*boo!*")
31 u'<p><em>boo!</em></p>\n'
32 >>> markdowner.convert("**boom!**")
33 u'<p><strong>boom!</strong></p>\n'
34
35This implementation of Markdown implements the full "core" syntax plus a
36number of extras (e.g., code syntax coloring, footnotes) as described on
37<https://github.com/trentm/python-markdown2/wiki/Extras>.
38"""
39
# Long description shown by the command-line interface's --help output.
cmdln_desc = """A fast and complete Python implementation of Markdown, a
text-to-HTML conversion tool for web writers.

Supported extra syntax options (see -x|--extras option below and
see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):

* code-friendly: Disable _ and __ for em and strong.
* cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
* fenced-code-blocks: Allows a code block to not have to be indented
  by fencing it with '```' on a line before and after. Based on
  <http://github.github.com/github-flavored-markdown/> with support for
  syntax highlighting.
* footnotes: Support footnotes as in use on daringfireball.net and
  implemented in other Markdown processors (tho not in Markdown.pl v1.0.1).
* header-ids: Adds "id" attributes to headers. The id value is a slug of
  the header text.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
  string to use for a "class" tag attribute. Currently only supports
  "pre" and "code" tags. Add an issue if you require this for other tags.
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
  have markdown processing be done on its contents. Similar to
  <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
  some limitations.
* metadata: Extract metadata from a leading '---'-fenced block.
  See <https://github.com/trentm/python-markdown2/issues/77> for details.
* nofollow: Add `rel="nofollow"` to add `<a>` tags with an href. See
  <http://en.wikipedia.org/wiki/Nofollow>.
* pyshell: Treats unindented Python interactive shell sessions as <code>
  blocks.
* link-patterns: Auto-link given regex patterns in text (e.g. bug number
  references, revision number references).
* smarty-pants: Replaces ' and " with curly quotation marks or curly
  apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
  and ellipses.
* toc: The returned HTML string gets a new "toc_html" attribute which is
  a Table of Contents for the document. (experimental)
* xml: Passes one-liner processing instructions and namespaced XML tags.
* wiki-tables: Google Code Wiki-style tables. See
  <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
"""
80
81# Dev Notes:
82# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
83# not yet sure if there implications with this. Compare 'pydoc sre'
84# and 'perldoc perlre'.
85
# Package version metadata. Bump `__version_info__`; `__version__` is
# derived from it so the two can never disagree.
__version_info__ = (2, 1, 1)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"
89
90import os
91import sys
92from pprint import pprint
93import re
94import logging
95try:
96 from hashlib import md5
97except ImportError:
98 from md5 import md5
99import optparse
100from random import random, randint
101import codecs
102
103
#---- Python version compat

# `quote` moved into `urllib.parse` in Python 3; alias it so the rest of
# the module uses one name on either interpreter.
try:
    from urllib.parse import quote # python3
except ImportError:
    from urllib import quote # python2

# Python 2.3 compat: `set` and `reversed` first appeared in 2.4, so
# provide equivalents when running on something older.
if sys.version_info[:2] < (2,4):
    from sets import Set as set
    def reversed(sequence):
        # Generator equivalent of the 2.4+ builtin (enough for our uses).
        for i in sequence[::-1]:
            yield i

# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
if sys.version_info[0] <= 2:
    py3 = False
    try:
        bytes
    except NameError:
        # Pre-2.6 Pythons have no `bytes` builtin; `str` is the byte type.
        bytes = str
    base_string_type = basestring
elif sys.version_info[0] >= 3:
    py3 = True
    unicode = str
    base_string_type = str
129
130
131
132#---- globals
133
134DEBUG = False
135log = logging.getLogger("markdown")
136
137DEFAULT_TAB_WIDTH = 4
138
139
140SECRET_SALT = bytes(randint(0, 1000000))
141def _hash_text(s):
142 return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()
143
144# Table of hash values for escaped characters:
145g_escape_table = dict([(ch, _hash_text(ch))
146 for ch in '\\`*_{}[]()>#+-.!'])
147
148
149
150#---- exceptions
151
class MarkdownError(Exception):
    """Raised for errors encountered during markdown processing."""
154
155
156
157#---- public api
158
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Read the file at `path`, decode it with `encoding`, convert to HTML.

    All other options are passed through to the `Markdown` converter; see
    `Markdown.__init__` for their meaning. Returns the HTML string.
    """
    fp = codecs.open(path, 'r', encoding)
    try:
        text = fp.read()
    finally:
        # Was leaked when read() raised (e.g. a decode error); always close.
        fp.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
170
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """One-shot convenience: convert `text` with a throwaway converter.

    See `Markdown.__init__` for the meaning of the keyword options.
    """
    converter = Markdown(html4tags=html4tags, tab_width=tab_width,
                         safe_mode=safe_mode, extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
178
class Markdown(object):
    """Markdown-to-HTML converter: configure once, call convert() per doc."""

    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    # Per-document state; (re)initialized by reset() at each convert().
    urls = None
    titles = None
    html_blocks = None
    html_spans = None
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    # Matches lines containing only spaces/tabs; such lines are blanked
    # early so later regexen can use /\n+/ for blank-line runs.
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
199
    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None, use_file_vars=False):
        """Create a converter.

        html4tags -- if true, emit ">"-closed empty tags instead of
            XHTML-style " />" ones.
        tab_width -- number of spaces a tab expands to (default 4).
        safe_mode -- None (off), "replace", or "escape" for raw HTML;
            True is accepted for markdown.py compat and means "replace".
        extras -- sequence or dict of extra names to enable; merged with
            any class-level `extras`.
        link_patterns -- patterns for the "link-patterns" extra.
        use_file_vars -- if true, honor emacs-style file variables.
        """
        if html4tags:
            self.empty_element_suffix = ">"
        else:
            self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        else:
            self.safe_mode = safe_mode

        # Massaging and building the "extras" info.
        if self.extras is None:
            self.extras = {}
        elif not isinstance(self.extras, dict):
            # Class-level extras may be a plain sequence; normalize to dict.
            self.extras = dict([(e, None) for e in self.extras])
        if extras:
            if not isinstance(extras, dict):
                extras = dict([(e, None) for e in extras])
            self.extras.update(extras)
        assert isinstance(self.extras, dict)
        if "toc" in self.extras and not "header-ids" in self.extras:
            self.extras["header-ids"] = None  # "toc" implies "header-ids"
        # Snapshot so reset() can restore the configured extras each run.
        self._instance_extras = self.extras.copy()

        self.link_patterns = link_patterns
        self.use_file_vars = use_file_vars
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

        self._escape_table = g_escape_table.copy()
        if "smarty-pants" in self.extras:
            # Quotes must survive escaping so smart punctuation can see them.
            self._escape_table['"'] = _hash_text('"')
            self._escape_table["'"] = _hash_text("'")
238
239 def reset(self):
240 self.urls = {}
241 self.titles = {}
242 self.html_blocks = {}
243 self.html_spans = {}
244 self.list_level = 0
245 self.extras = self._instance_extras.copy()
246 if "footnotes" in self.extras:
247 self.footnotes = {}
248 self.footnote_ids = []
249 if "header-ids" in self.extras:
250 self._count_from_header_id = {} # no `defaultdict` in Python 2.4
251 if "metadata" in self.extras:
252 self.metadata = {}
253
    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)

    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        # Extra args given as "name=N" become ints if possible.
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # strip metadata from head and extract
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        text = self.preprocess(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        text += "\n"

        # Result is a unicode subclass so extras can hang attributes
        # (toc_html, metadata) off the returned string.
        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv
352
353 def postprocess(self, text):
354 """A hook for subclasses to do some postprocessing of the html, if
355 desired. This is called before unescaping of special chars and
356 unhashing of raw HTML spans.
357 """
358 return text
359
360 def preprocess(self, text):
361 """A hook for subclasses to do some preprocessing of the Markdown, if
362 desired. This is called after basic formatting of the text, but prior
363 to any extras, safe mode, etc. processing.
364 """
365 return text
366
    # Is metadata if the content starts with '---'-fenced `key: value`
    # pairs. E.g. (indented for presentation):
    #    ---
    #    foo: bar
    #    another-var: blah blah
    #    ---
    # Group 1 captures all `key: value` lines between the fences.
    _metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
374
375 def _extract_metadata(self, text):
376 # fast test
377 if not text.startswith("---"):
378 return text
379 match = self._metadata_pat.match(text)
380 if not match:
381 return text
382
383 tail = text[len(match.group(0)):]
384 metadata_str = match.group(1).strip()
385 for line in metadata_str.split('\n'):
386 key, value = line.split(':', 1)
387 self.metadata[key.strip()] = value.strip()
388
389 return tail
390
391
    # Matches an emacs '-*- var: val; ... -*-' one-liner variables spec.
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #    PREFIX Local Variables: SUFFIX
    #    PREFIX mode: Tcl SUFFIX
    #    PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
407
408 def _get_emacs_vars(self, text):
409 """Return a dictionary of emacs-style local variables.
410
411 Parsing is done loosely according to this spec (and according to
412 some in-practice deviations from this):
413 http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
414 """
415 emacs_vars = {}
416 SIZE = pow(2, 13) # 8kB
417
418 # Search near the start for a '-*-'-style one-liner of variables.
419 head = text[:SIZE]
420 if "-*-" in head:
421 match = self._emacs_oneliner_vars_pat.search(head)
422 if match:
423 emacs_vars_str = match.group(1)
424 assert '\n' not in emacs_vars_str
425 emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
426 if s.strip()]
427 if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
428 # While not in the spec, this form is allowed by emacs:
429 # -*- Tcl -*-
430 # where the implied "variable" is "mode". This form
431 # is only allowed if there are no other variables.
432 emacs_vars["mode"] = emacs_var_strs[0].strip()
433 else:
434 for emacs_var_str in emacs_var_strs:
435 try:
436 variable, value = emacs_var_str.strip().split(':', 1)
437 except ValueError:
438 log.debug("emacs variables error: malformed -*- "
439 "line: %r", emacs_var_str)
440 continue
441 # Lowercase the variable name because Emacs allows "Mode"
442 # or "mode" or "MoDe", etc.
443 emacs_vars[variable.lower()] = value.strip()
444
445 tail = text[-SIZE:]
446 if "Local Variables" in tail:
447 match = self._emacs_local_vars_pat.search(tail)
448 if match:
449 prefix = match.group("prefix")
450 suffix = match.group("suffix")
451 lines = match.group("content").splitlines(0)
452 #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
453 # % (prefix, suffix, match.group("content"), lines)
454
455 # Validate the Local Variables block: proper prefix and suffix
456 # usage.
457 for i, line in enumerate(lines):
458 if not line.startswith(prefix):
459 log.debug("emacs variables error: line '%s' "
460 "does not use proper prefix '%s'"
461 % (line, prefix))
462 return {}
463 # Don't validate suffix on last line. Emacs doesn't care,
464 # neither should we.
465 if i != len(lines)-1 and not line.endswith(suffix):
466 log.debug("emacs variables error: line '%s' "
467 "does not use proper suffix '%s'"
468 % (line, suffix))
469 return {}
470
471 # Parse out one emacs var per line.
472 continued_for = None
473 for line in lines[:-1]: # no var on the last line ("PREFIX End:")
474 if prefix: line = line[len(prefix):] # strip prefix
475 if suffix: line = line[:-len(suffix)] # strip suffix
476 line = line.strip()
477 if continued_for:
478 variable = continued_for
479 if line.endswith('\\'):
480 line = line[:-1].rstrip()
481 else:
482 continued_for = None
483 emacs_vars[variable] += ' ' + line
484 else:
485 try:
486 variable, value = line.split(':', 1)
487 except ValueError:
488 log.debug("local variables error: missing colon "
489 "in local variables entry: '%s'" % line)
490 continue
491 # Do NOT lowercase the variable name, because Emacs only
492 # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
493 value = value.strip()
494 if value.endswith('\\'):
495 value = value[:-1].rstrip()
496 continued_for = variable
497 else:
498 continued_for = None
499 emacs_vars[variable] = value
500
501 # Unquote values.
502 for var, val in list(emacs_vars.items()):
503 if len(val) > 1 and (val.startswith('"') and val.endswith('"')
504 or val.startswith('"') and val.endswith('"')):
505 emacs_vars[var] = val[1:-1]
506
507 return emacs_vars
508
509 # Cribbed from a post by Bart Lateur:
510 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
511 _detab_re = re.compile(r'(.*?)\t', re.M)
512 def _detab_sub(self, match):
513 g1 = match.group(1)
514 return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
515 def _detab(self, text):
516 r"""Remove (leading?) tabs from a file.
517
518 >>> m = Markdown()
519 >>> m._detab("\tfoo")
520 ' foo'
521 >>> m._detab(" \tfoo")
522 ' foo'
523 >>> m._detab("\t foo")
524 ' foo'
525 >>> m._detab(" foo")
526 ' foo'
527 >>> m._detab(" foo\n\tbar\tblam")
528 ' foo\n bar blam'
529 """
530 if '\t' not in text:
531 return text
532 return self._detab_re.subn(self._detab_sub, text)[0]
533
    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    # "Strict" form: end tag must start a line, so nested same-name tags
    # (indented inner divs) are consumed whole. Tried before the liberal
    # form below.
    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    # "Liberal" form: matches from `\n<tag>` to the first `</tag>\n`,
    # even mid-line; note `ins` and `del` are deliberately absent here.
    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)
569
    # Matches a markdown="1" attribute (the "markdown-in-html" extra).
    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
    def _hash_html_block_sub(self, match, raw=False):
        """Regex callback: replace a matched HTML block with a hash key.

        The block's HTML is stored in self.html_blocks under an md5 key so
        later passes can't mangle it; convert() unhashes at the end.
        `raw` means the block came from the original source (matters for
        safe_mode sanitization).
        """
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
            first_line = html.split('\n', 1)[0]
            m = self._html_markdown_attr_re.search(first_line)
            if m:
                # Hash only the opening and closing tag lines; the middle
                # is left inline so it still gets markdown-processed.
                lines = html.split('\n')
                middle = '\n'.join(lines[1:-1])
                last_line = lines[-1]
                # Drop the markdown="1" attribute from the emitted tag.
                first_line = first_line[:m.start()] + first_line[m.end():]
                f_key = _hash_text(first_line)
                self.html_blocks[f_key] = first_line
                l_key = _hash_text(last_line)
                self.html_blocks[l_key] = last_line
                return ''.join(["\n\n", f_key,
                    "\n\n", middle, "\n\n",
                    l_key, "\n\n"])
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"
594
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        # Fast path: no '<' means no HTML at all.
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                    if start_idx == 0:
                        break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be following by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text
702
    def _strip_link_definitions(self, text):
        """Remove `[id]: url "title"` link definitions from `text`.

        The definitions are stored (via _extract_link_def_sub) in
        self.urls and self.titles for later reference-link resolution.
        """
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(.+?)>?           # url = \2
              [ \t]*
            (?:
                \n?             # maybe one newline
                [ \t]*
                (?<=\s)         # lookbehind for whitespace
                ['"(]
                ([^\n]*)        # title = \3
                ['")]
                [ \t]*
            )?  # title is optional
            (?:\n+|\Z)
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)
729
730 def _extract_link_def_sub(self, match):
731 id, url, title = match.groups()
732 key = id.lower() # Link IDs are case-insensitive
733 self.urls[key] = self._encode_amps_and_angles(url)
734 if title:
735 self.titles[key] = title
736 return ""
737
738 def _extract_footnote_def_sub(self, match):
739 id, text = match.groups()
740 text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
741 normed_id = re.sub(r'\W', '-', id)
742 # Ensure footnote text ends with a couple newlines (for some
743 # block gamut matches).
744 self.footnotes[normed_id] = text + "\n\n"
745 return ""
746
    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
                Text of the note.
        """
        less_than_tab = self.tab_width - 1
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
            [ \t]*
            (                       # footnote text = \2
              # First line need not start with the spaces.
              (?:\s*.*\n+)
              (?:
                (?:[ ]{%d} | \t)  # Subsequent lines must be indented.
                .*\n+
              )*
            )
            # Lookahead for non-space at line-start, or end of doc.
            (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
            re.X | re.M)
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)
779
780
    # (char, regex) pairs used by _run_block_gamut() to find candidate
    # horizontal-rule lines for each of the three legal hr characters.
    _hr_data = [
        ('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
        ('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
        ('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
    ]
786
    def _run_block_gamut(self, text):
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.
        # NOTE: the ordering of the passes below is significant.

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        text = self._do_headers(text)

        # Do Horizontal Rules:
        # On the number of spaces in horizontal rules: The spec is fuzzy: "If
        # you wish, you may use spaces between the hyphens or asterisks."
        # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
        # hr chars to one or two. We'll reproduce that limit here.
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        for ch, regex in self._hr_data:
            if ch in text:
                # Iterate matches in reverse so earlier spans stay valid
                # while we splice replacements into `text`.
                for m in reversed(list(regex.finditer(text))):
                    tail = m.group(1).rstrip()
                    if not tail.strip(ch + ' ') and tail.count("   ") == 0:
                        start, end = m.span()
                        text = text[:start] + hr + text[end:]

        text = self._do_lists(text)

        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)
        if "wiki-tables" in self.extras:
            text = self._do_wiki_tables(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text
830
831 def _pyshell_block_sub(self, match):
832 lines = match.group(0).splitlines(0)
833 _dedentlines(lines)
834 indent = ' ' * self.tab_width
835 s = ('\n' # separate from possible cuddled paragraph
836 + indent + ('\n'+indent).join(lines)
837 + '\n\n')
838 return s
839
840 def _prepare_pyshell_blocks(self, text):
841 """Ensure that Python interactive shell sessions are put in
842 code blocks -- even if not properly indented.
843 """
844 if ">>>" not in text:
845 return text
846
847 less_than_tab = self.tab_width - 1
848 _pyshell_block_re = re.compile(r"""
849 ^([ ]{0,%d})>>>[ ].*\n # first line
850 ^(\1.*\S+.*\n)* # any number of subsequent lines
851 ^\n # ends with a blank line
852 """ % less_than_tab, re.M | re.X)
853
854 return _pyshell_block_re.sub(self._pyshell_block_sub, text)
855
856 def _wiki_table_sub(self, match):
857 ttext = match.group(0).strip()
858 #print 'wiki table: %r' % match.group(0)
859 rows = []
860 for line in ttext.splitlines(0):
861 line = line.strip()[2:-2].strip()
862 row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
863 rows.append(row)
864 #pprint(rows)
865 hlines = ['<table>', '<tbody>']
866 for row in rows:
867 hrow = ['<tr>']
868 for cell in row:
869 hrow.append('<td>')
870 hrow.append(self._run_span_gamut(cell))
871 hrow.append('</td>')
872 hrow.append('</tr>')
873 hlines.append(''.join(hrow))
874 hlines += ['</tbody>', '</table>']
875 return '\n'.join(hlines) + '\n'
876
877 def _do_wiki_tables(self, text):
878 # Optimization.
879 if "||" not in text:
880 return text
881
882 less_than_tab = self.tab_width - 1
883 wiki_table_re = re.compile(r'''
884 (?:(?<=\n\n)|\A\n?) # leading blank line
885 ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line
886 (^\1\|\|.+?\|\|\n)* # any number of subsequent lines
887 ''' % less_than_tab, re.M | re.X)
888 return wiki_table_re.sub(self._wiki_table_sub, text)
889
    def _run_span_gamut(self, text):
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.
        # NOTE: the ordering of the passes below is significant.

        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        text = self._do_italics_and_bold(text)

        if "smarty-pants" in self.extras:
            text = self._do_smart_punctuation(text)

        # Do hard breaks:
        # two-or-more trailing spaces before a newline become <br/>.
        text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text
920
921 # "Sorta" because auto-links are identified as "tag" tokens.
922 _sorta_html_tokenize_re = re.compile(r"""
923 (
924 # tag
925 </?
926 (?:\w+) # tag name
927 (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes
928 \s*/?>
929 |
930 # auto-link (e.g., <http://www.activestate.com/>)
931 <\w+[^>]*>
932 |
933 <!--.*?--> # comment
934 |
935 <\?.*?\?> # processing instruction
936 )
937 """, re.X)
938
939 def _escape_special_chars(self, text):
940 # Python markdown note: the HTML tokenization here differs from
941 # that in Markdown.pl, hence the behaviour for subtle cases can
942 # differ (I believe the tokenizer here does a better job because
943 # it isn't susceptible to unmatched '<' and '>' in HTML tags).
944 # Note, however, that '>' is not allowed in an auto-link URL
945 # here.
946 escaped = []
947 is_html_markup = False
948 for token in self._sorta_html_tokenize_re.split(text):
949 if is_html_markup:
950 # Within tags/HTML-comments/auto-links, encode * and _
951 # so they don't conflict with their use in Markdown for
952 # italics and strong. We're replacing each such
953 # character with its corresponding MD5 checksum value;
954 # this is likely overkill, but it should prevent us from
955 # colliding with the escape values by accident.
956 escaped.append(token.replace('*', self._escape_table['*'])
957 .replace('_', self._escape_table['_']))
958 else:
959 escaped.append(self._encode_backslash_escapes(token))
960 is_html_markup = not is_html_markup
961 return ''.join(escaped)
962
963 def _hash_html_spans(self, text):
964 # Used for safe_mode.
965
966 def _is_auto_link(s):
967 if ':' in s and self._auto_link_re.match(s):
968 return True
969 elif '@' in s and self._auto_email_link_re.match(s):
970 return True
971 return False
972
973 tokens = []
974 is_html_markup = False
975 for token in self._sorta_html_tokenize_re.split(text):
976 if is_html_markup and not _is_auto_link(token):
977 sanitized = self._sanitize_html(token)
978 key = _hash_text(sanitized)
979 self.html_spans[key] = sanitized
980 tokens.append(key)
981 else:
982 tokens.append(token)
983 is_html_markup = not is_html_markup
984 return ''.join(tokens)
985
986 def _unhash_html_spans(self, text):
987 for key, sanitized in list(self.html_spans.items()):
988 text = text.replace(key, sanitized)
989 return text
990
991 def _sanitize_html(self, s):
992 if self.safe_mode == "replace":
993 return self.html_removed_text
994 elif self.safe_mode == "escape":
995 replacements = [
996 ('&', '&amp;'),
997 ('<', '&lt;'),
998 ('>', '&gt;'),
999 ]
1000 for before, after in replacements:
1001 s = s.replace(before, after)
1002 return s
1003 else:
1004 raise MarkdownError("invalid value for 'safe_mode': %r (must be "
1005 "'escape' or 'replace')" % self.safe_mode)
1006
    # Tails of inline/reference links, matched just past the closing ']'
    # of the link text (see _do_links()).
    _tail_of_inline_link_re = re.compile(r'''
          # Match tail of: [text](/url/) or [text](/url/ "title")
          \(            # literal paren
            [ \t]*
            (?P<url>            # \1
                <.*?>
                |
                .*?
            )
            [ \t]*
            (                   # \2
              (['"])            # quote char = \3
              (?P<title>.*?)
              \3                # matching quote
            )?                  # title is optional
          \)
        ''', re.X | re.S)
    _tail_of_reference_link_re = re.compile(r'''
          # Match tail of: [text][id]
          [ ]?          # one optional space
          (?:\n[ ]*)?   # one optional newline followed by spaces
          \[
            (?P<id>.*?)
          \]
        ''', re.X | re.S)
1032
1033 def _do_links(self, text):
1034 """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
1035
1036 This is a combination of Markdown.pl's _DoAnchors() and
1037 _DoImages(). They are done together because that simplified the
1038 approach. It was necessary to use a different approach than
1039 Markdown.pl because of the lack of atomic matching support in
1040 Python's regex engine used in $g_nested_brackets.
1041 """
1042 MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
1043
1044 # `anchor_allowed_pos` is used to support img links inside
1045 # anchors, but not anchors inside anchors. An anchor's start
1046 # pos must be `>= anchor_allowed_pos`.
1047 anchor_allowed_pos = 0
1048
1049 curr_pos = 0
1050 while True: # Handle the next link.
1051 # The next '[' is the start of:
1052 # - an inline anchor: [text](url "title")
1053 # - a reference anchor: [text][id]
1054 # - an inline img: ![text](url "title")
1055 # - a reference img: ![text][id]
1056 # - a footnote ref: [^id]
1057 # (Only if 'footnotes' extra enabled)
1058 # - a footnote defn: [^id]: ...
1059 # (Only if 'footnotes' extra enabled) These have already
1060 # been stripped in _strip_footnote_definitions() so no
1061 # need to watch for them.
1062 # - a link definition: [id]: url "title"
1063 # These have already been stripped in
1064 # _strip_link_definitions() so no need to watch for them.
1065 # - not markup: [...anything else...
1066 try:
1067 start_idx = text.index('[', curr_pos)
1068 except ValueError:
1069 break
1070 text_length = len(text)
1071
1072 # Find the matching closing ']'.
1073 # Markdown.pl allows *matching* brackets in link text so we
1074 # will here too. Markdown.pl *doesn't* currently allow
1075 # matching brackets in img alt text -- we'll differ in that
1076 # regard.
1077 bracket_depth = 0
1078 for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
1079 text_length)):
1080 ch = text[p]
1081 if ch == ']':
1082 bracket_depth -= 1
1083 if bracket_depth < 0:
1084 break
1085 elif ch == '[':
1086 bracket_depth += 1
1087 else:
1088 # Closing bracket not found within sentinel length.
1089 # This isn't markup.
1090 curr_pos = start_idx + 1
1091 continue
1092 link_text = text[start_idx+1:p]
1093
1094 # Possibly a footnote ref?
1095 if "footnotes" in self.extras and link_text.startswith("^"):
1096 normed_id = re.sub(r'\W', '-', link_text[1:])
1097 if normed_id in self.footnotes:
1098 self.footnote_ids.append(normed_id)
1099 result = '<sup class="footnote-ref" id="fnref-%s">' \
1100 '<a href="#fn-%s">%s</a></sup>' \
1101 % (normed_id, normed_id, len(self.footnote_ids))
1102 text = text[:start_idx] + result + text[p+1:]
1103 else:
1104 # This id isn't defined, leave the markup alone.
1105 curr_pos = p+1
1106 continue
1107
1108 # Now determine what this is by the remainder.
1109 p += 1
1110 if p == text_length:
1111 return text
1112
1113 # Inline anchor or img?
1114 if text[p] == '(': # attempt at perf improvement
1115 match = self._tail_of_inline_link_re.match(text, p)
1116 if match:
1117 # Handle an inline anchor or img.
1118 is_img = start_idx > 0 and text[start_idx-1] == "!"
1119 if is_img:
1120 start_idx -= 1
1121
1122 is_inline_img = start_idx > 0 and text[start_idx-1] == "#"
1123 if is_inline_img:
1124 start_idx -= 1
1125 is_img = 1
1126
1127 url, title = match.group("url"), match.group("title")
1128 if url and url[0] == '<':
1129 url = url[1:-1] # '<url>' -> 'url'
1130 # We've got to encode these to avoid conflicting
1131 # with italics/bold.
1132 url = url.replace('*', self._escape_table['*']) \
1133 .replace('_', self._escape_table['_'])
1134 if title:
1135 title_str = ' title="%s"' % (
1136 _xml_escape_attr(title)
1137 .replace('*', self._escape_table['*'])
1138 .replace('_', self._escape_table['_']))
1139 else:
1140 title_str = ''
1141 if is_img:
1142 if is_inline_img:
1143result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \
1144 % (url.replace('"', '&quot;'),
1145 _xml_escape_attr(link_text),
1146 title_str, self.empty_element_suffix)
1147 else:
1148 result = '<img src="%s" alt="%s"%s%s' \
1149% (url.replace('"', '&quot;'),
1150 _xml_escape_attr(link_text),
1151 title_str, self.empty_element_suffix)
1152 if "smarty-pants" in self.extras:
1153 result = result.replace('"', self._escape_table['"'])
1154 curr_pos = start_idx + len(result)
1155 text = text[:start_idx] + result + text[match.end():]
1156 elif start_idx >= anchor_allowed_pos:
1157 result_head = '<a href="%s"%s>' % (url, title_str)
1158 result = '%s%s</a>' % (result_head, link_text)
1159 if "smarty-pants" in self.extras:
1160 result = result.replace('"', self._escape_table['"'])
1161 # <img> allowed from curr_pos on, <a> from
1162 # anchor_allowed_pos on.
1163 curr_pos = start_idx + len(result_head)
1164 anchor_allowed_pos = start_idx + len(result)
1165 text = text[:start_idx] + result + text[match.end():]
1166 else:
1167 # Anchor not allowed here.
1168 curr_pos = start_idx + 1
1169 continue
1170
1171 # Reference anchor or img?
1172 else:
1173 match = self._tail_of_reference_link_re.match(text, p)
1174 if match:
1175 # Handle a reference-style anchor or img.
1176 is_img = start_idx > 0 and text[start_idx-1] == "!"
1177 if is_img:
1178 start_idx -= 1
1179 link_id = match.group("id").lower()
1180 if not link_id:
1181 link_id = link_text.lower() # for links like [this][]
1182 if link_id in self.urls:
1183 url = self.urls[link_id]
1184 # We've got to encode these to avoid conflicting
1185 # with italics/bold.
1186 url = url.replace('*', self._escape_table['*']) \
1187 .replace('_', self._escape_table['_'])
1188 title = self.titles.get(link_id)
1189 if title:
1190 before = title
1191 title = _xml_escape_attr(title) \
1192 .replace('*', self._escape_table['*']) \
1193 .replace('_', self._escape_table['_'])
1194 title_str = ' title="%s"' % title
1195 else:
1196 title_str = ''
1197 if is_img:
1198 result = '<img src="%s" alt="%s"%s%s' \
1199 % (url.replace('"', '&quot;'),
1200 link_text.replace('"', '&quot;'),
1201 title_str, self.empty_element_suffix)
1202 if "smarty-pants" in self.extras:
1203 result = result.replace('"', self._escape_table['"'])
1204 curr_pos = start_idx + len(result)
1205 text = text[:start_idx] + result + text[match.end():]
1206 elif start_idx >= anchor_allowed_pos:
1207 result = '<a href="%s"%s>%s</a>' \
1208 % (url, title_str, link_text)
1209 result_head = '<a href="%s"%s>' % (url, title_str)
1210 result = '%s%s</a>' % (result_head, link_text)
1211 if "smarty-pants" in self.extras:
1212 result = result.replace('"', self._escape_table['"'])
1213 # <img> allowed from curr_pos on, <a> from
1214 # anchor_allowed_pos on.
1215 curr_pos = start_idx + len(result_head)
1216 anchor_allowed_pos = start_idx + len(result)
1217 text = text[:start_idx] + result + text[match.end():]
1218 else:
1219 # Anchor not allowed here.
1220 curr_pos = start_idx + 1
1221 else:
1222 # This id isn't defined, leave the markup alone.
1223 curr_pos = match.end()
1224 continue
1225
1226 # Otherwise, it isn't markup.
1227 curr_pos = start_idx + 1
1228
1229 return text
1230
1231 def header_id_from_text(self, text, prefix, n):
1232 """Generate a header id attribute value from the given header
1233 HTML content.
1234
1235 This is only called if the "header-ids" extra is enabled.
1236 Subclasses may override this for different header ids.
1237
1238 @param text {str} The text of the header tag
1239 @param prefix {str} The requested prefix for header ids. This is the
1240 value of the "header-ids" extra key, if any. Otherwise, None.
1241 @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
1242 @returns {str} The value for the header tag's "id" attribute. Return
1243 None to not have an id attribute and to exclude this header from
1244 the TOC (if the "toc" extra is specified).
1245 """
1246 header_id = _slugify(text)
1247 if prefix and isinstance(prefix, base_string_type):
1248 header_id = prefix + '-' + header_id
1249 if header_id in self._count_from_header_id:
1250 self._count_from_header_id[header_id] += 1
1251 header_id += '-%s' % self._count_from_header_id[header_id]
1252 else:
1253 self._count_from_header_id[header_id] = 1
1254 return header_id
1255
    # Lazily-created list of (level, id, name) tuples for the "toc" extra.
    _toc = None
    def _toc_add_entry(self, level, id, name):
        """Record one table-of-contents entry for a rendered header."""
        if self._toc is None:
            self._toc = []
        self._toc.append((level, id, self._unescape_special_chars(name)))
1261
1262 _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
1263 def _setext_h_sub(self, match):
1264 n = {"=": 1, "-": 2}[match.group(2)[0]]
1265 demote_headers = self.extras.get("demote-headers")
1266 if demote_headers:
1267 n = min(n + demote_headers, 6)
1268 header_id_attr = ""
1269 if "header-ids" in self.extras:
1270 header_id = self.header_id_from_text(match.group(1),
1271 self.extras["header-ids"], n)
1272 if header_id:
1273 header_id_attr = ' id="%s"' % header_id
1274 html = self._run_span_gamut(match.group(1))
1275 if "toc" in self.extras and header_id:
1276 self._toc_add_entry(n, header_id, html)
1277 return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1278
1279 _atx_h_re = re.compile(r'''
1280 ^(\#{1,6}) # \1 = string of #'s
1281 [ \t]+
1282 (.+?) # \2 = Header text
1283 [ \t]*
1284 (?<!\\) # ensure not an escaped trailing '#'
1285 \#* # optional closing #'s (not counted)
1286 \n+
1287 ''', re.X | re.M)
1288 def _atx_h_sub(self, match):
1289 n = len(match.group(1))
1290 demote_headers = self.extras.get("demote-headers")
1291 if demote_headers:
1292 n = min(n + demote_headers, 6)
1293 header_id_attr = ""
1294 if "header-ids" in self.extras:
1295 header_id = self.header_id_from_text(match.group(2),
1296 self.extras["header-ids"], n)
1297 if header_id:
1298 header_id_attr = ' id="%s"' % header_id
1299 html = self._run_span_gamut(match.group(2))
1300 if "toc" in self.extras and header_id:
1301 self._toc_add_entry(n, header_id, html)
1302 return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1303
1304 def _do_headers(self, text):
1305 # Setext-style headers:
1306 # Header 1
1307 # ========
1308 #
1309 # Header 2
1310 # --------
1311 text = self._setext_h_re.sub(self._setext_h_sub, text)
1312
1313 # atx-style headers:
1314 # # Header 1
1315 # ## Header 2
1316 # ## Header 2 with closing hashes ##
1317 # ...
1318 # ###### Header 6
1319 text = self._atx_h_re.sub(self._atx_h_sub, text)
1320
1321 return text
1322
1323
    # Characters that may introduce an unordered-list item.
    _marker_ul_chars = '*+-'
    # Any list marker: a bullet char, or digits followed by a period.
    _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
    _marker_ul = '(?:[%s])' % _marker_ul_chars
    _marker_ol = r'(?:\d+\.)'
1328
1329 def _list_sub(self, match):
1330 lst = match.group(1)
1331 lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1332 result = self._process_list_items(lst)
1333 if self.list_level:
1334 return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1335 else:
1336 return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
1337
1338 def _do_lists(self, text):
1339 # Form HTML ordered (numbered) and unordered (bulleted) lists.
1340
1341 # Iterate over each *non-overlapping* list match.
1342 pos = 0
1343 while True:
1344 # Find the *first* hit for either list style (ul or ol). We
1345 # match ul and ol separately to avoid adjacent lists of different
1346 # types running into each other (see issue #16).
1347 hits = []
1348 for marker_pat in (self._marker_ul, self._marker_ol):
1349 less_than_tab = self.tab_width - 1
1350 whole_list = r'''
1351 ( # \1 = whole list
1352 ( # \2
1353 [ ]{0,%d}
1354 (%s) # \3 = first list item marker
1355 [ \t]+
1356 (?!\ *\3\ ) # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
1357 )
1358 (?:.+?)
1359 ( # \4
1360 \Z
1361 |
1362 \n{2,}
1363 (?=\S)
1364 (?! # Negative lookahead for another list item marker
1365 [ \t]*
1366 %s[ \t]+
1367 )
1368 )
1369 )
1370 ''' % (less_than_tab, marker_pat, marker_pat)
1371 if self.list_level: # sub-list
1372 list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
1373 else:
1374 list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
1375 re.X | re.M | re.S)
1376 match = list_re.search(text, pos)
1377 if match:
1378 hits.append((match.start(), match))
1379 if not hits:
1380 break
1381 hits.sort()
1382 match = hits[0][1]
1383 start, end = match.span()
1384 text = text[:start] + self._list_sub(match) + text[end:]
1385 pos = end
1386
1387 return text
1388
    # One list item: optional leading blank line, indentation, a marker,
    # the item text, and its trailing newline(s); lookahead requires the
    # next item (or end) at the same indentation.
    _list_item_re = re.compile(r'''
        (\n)?                   # leading line = \1
        (^[ \t]*)               # leading whitespace = \2
        (?P<marker>%s) [ \t]+   # list marker = \3
        ((?:.+?)                # list item text = \4
         (\n{1,2}))             # eols = \5
        (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
        ''' % (_marker_any, _marker_any),
        re.M | re.X | re.S)
1398
1399 _last_li_endswith_two_eols = False
1400 def _list_item_sub(self, match):
1401 item = match.group(4)
1402 leading_line = match.group(1)
1403 leading_space = match.group(2)
1404 if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1405 item = self._run_block_gamut(self._outdent(item))
1406 else:
1407 # Recursion for sub-lists:
1408 item = self._do_lists(self._outdent(item))
1409 if item.endswith('\n'):
1410 item = item[:-1]
1411 item = self._run_span_gamut(item)
1412 self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1413 return "<li>%s</li>\n" % item
1414
1415 def _process_list_items(self, list_str):
1416 # Process the contents of a single ordered or unordered list,
1417 # splitting it into individual list items.
1418
1419 # The $g_list_level global keeps track of when we're inside a list.
1420 # Each time we enter a list, we increment it; when we leave a list,
1421 # we decrement. If it's zero, we're not in a list anymore.
1422 #
1423 # We do this because when we're not inside a list, we want to treat
1424 # something like this:
1425 #
1426 # I recommend upgrading to version
1427 # 8. Oops, now this line is treated
1428 # as a sub-list.
1429 #
1430 # As a single paragraph, despite the fact that the second line starts
1431 # with a digit-period-space sequence.
1432 #
1433 # Whereas when we're inside a list (or sub-list), that line will be
1434 # treated as the start of a sub-list. What a kludge, huh? This is
1435 # an aspect of Markdown's syntax that's hard to parse perfectly
1436 # without resorting to mind-reading. Perhaps the solution is to
1437 # change the syntax rules such that sub-lists must start with a
1438 # starting cardinal number; e.g. "1." or "a.".
1439 self.list_level += 1
1440 self._last_li_endswith_two_eols = False
1441 list_str = list_str.rstrip('\n') + '\n'
1442 list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1443 self.list_level -= 1
1444 return list_str
1445
1446 def _get_pygments_lexer(self, lexer_name):
1447 try:
1448 from pygments import lexers, util
1449 except ImportError:
1450 return None
1451 try:
1452 return lexers.get_lexer_by_name(lexer_name)
1453 except util.ClassNotFound:
1454 return None
1455
1456 def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
1457 import pygments
1458 import pygments.formatters
1459
1460 class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1461 def _wrap_code(self, inner):
1462 """A function for use in a Pygments Formatter which
1463 wraps in <code> tags.
1464 """
1465 yield 0, "<code>"
1466 for tup in inner:
1467 yield tup
1468 yield 0, "</code>"
1469
1470 def wrap(self, source, outfile):
1471 """Return the source with a code, pre, and div."""
1472 return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1473
1474 formatter_opts.setdefault("cssclass", "codehilite")
1475 formatter = HtmlCodeFormatter(**formatter_opts)
1476 return pygments.highlight(codeblock, lexer, formatter)
1477
    def _code_block_sub(self, match, is_fenced_code_block=False):
        """Regex callback: render one code block as <pre><code> HTML.

        For fenced blocks, group(1) is the optional lexer name and
        group(2) the content; for indented blocks, group(1) is the
        block, which is outdented/detabbed here.  If a lexer name is
        present (fence info string, or the deprecated ':::lang'
        "code-color" prefix) and Pygments can supply a lexer, the block
        is syntax-colored instead of plain-encoded.
        """
        lexer_name = None

        if is_fenced_code_block:
            lexer_name = match.group(1)
            if lexer_name:
                formatter_opts = self.extras['fenced-code-blocks'] or {}
            codeblock = match.group(2)
            codeblock = codeblock[:-1]  # drop one trailing newline
        else:
            codeblock = match.group(1)
            codeblock = self._outdent(codeblock)
            codeblock = self._detab(codeblock)
            codeblock = codeblock.lstrip('\n')  # trim leading newlines
            codeblock = codeblock.rstrip()      # trim trailing whitespace

        # Note: "code-color" extra is DEPRECATED.
        if "code-color" in self.extras and codeblock.startswith(":::"):
            lexer_name, rest = codeblock.split('\n', 1)
            lexer_name = lexer_name[3:].strip()
            codeblock = rest.lstrip("\n")   # Remove lexer declaration line.
            formatter_opts = self.extras['code-color'] or {}

        if lexer_name:
            lexer = self._get_pygments_lexer(lexer_name)
            if lexer:
                colored = self._color_with_pygments(codeblock, lexer,
                                                    **formatter_opts)
                return "\n\n%s\n\n" % colored

        codeblock = self._encode_code(codeblock)
        pre_class_str = self._html_class_str_from_tag("pre")
        code_class_str = self._html_class_str_from_tag("code")
        return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
            pre_class_str, code_class_str, codeblock)
1513
1514 def _html_class_str_from_tag(self, tag):
1515 """Get the appropriate ' class="..."' string (note the leading
1516 space), if any, for the given tag.
1517 """
1518 if "html-classes" not in self.extras:
1519 return ""
1520 try:
1521 html_classes_from_tag = self.extras["html-classes"]
1522 except TypeError:
1523 return ""
1524 else:
1525 if tag in html_classes_from_tag:
1526 return ' class="%s"' % html_classes_from_tag[tag]
1527 return ""
1528
    def _do_code_blocks(self, text):
        """Process Markdown `<pre><code>` blocks.

        A code block is one or more lines each indented by a tab (or a
        tab-width of spaces), preceded by a blank line; rendering is
        delegated to `_code_block_sub()`.
        """
        code_block_re = re.compile(r'''
            (?:\n\n|\A\n?)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{%d} | \t)  # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,%d}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
            ''' % (self.tab_width, self.tab_width),
            re.M | re.X)
        return code_block_re.sub(self._code_block_sub, text)
1543
    # A ```-fenced code block ('fenced-code-blocks' extra): optional
    # language name on the opening fence, content, closing fence.
    _fenced_code_block_re = re.compile(r'''
        (?:\n\n|\A\n?)
        ^```([\w+-]+)?[ \t]*\n      # opening fence, $1 = optional lang
        (.*?)                       # $2 = code block content
        ^```[ \t]*\n                # closing fence
        ''', re.M | re.X | re.S)
1550
1551 def _fenced_code_block_sub(self, match):
1552 return self._code_block_sub(match, is_fenced_code_block=True);
1553
1554 def _do_fenced_code_blocks(self, text):
1555 """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
1556 return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
1557
    # Rules for a code span:
    # - backslash escapes are not interpreted in a code span
    # - to include one backtick, or a run of them, the delimiters must
    #   be a longer run of backticks
    # - cannot start or end a code span with a backtick; pad with a
    #   space and that space will be removed in the emitted HTML
    # See `test/tm-cases/escapes.text` for a number of edge-case
    # examples.
    _code_span_re = re.compile(r'''
            (?<!\\)
            (`+)        # \1 = Opening run of `
            (?!`)       # See Note A test/tm-cases/escapes.text
            (.+?)       # \2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        ''', re.X | re.S)
1575
1576 def _code_span_sub(self, match):
1577 c = match.group(2).strip(" \t")
1578 c = self._encode_code(c)
1579 return "<code>%s</code>" % c
1580
1581 def _do_code_spans(self, text):
1582 # * Backtick quotes are used for <code></code> spans.
1583 #
1584 # * You can use multiple backticks as the delimiters if you want to
1585 # include literal backticks in the code span. So, this input:
1586 #
1587 # Just type ``foo `bar` baz`` at the prompt.
1588 #
1589 # Will translate to:
1590 #
1591 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1592 #
1593 # There's no arbitrary limit to the number of backticks you
1594 # can use as delimters. If you need three consecutive backticks
1595 # in your code, use four for delimiters, etc.
1596 #
1597 # * You can use spaces to get literal backticks at the edges:
1598 #
1599 # ... type `` `bar` `` ...
1600 #
1601 # Turns to:
1602 #
1603 # ... type <code>`bar`</code> ...
1604 return self._code_span_re.sub(self._code_span_sub, text)
1605
1606 def _encode_code(self, text):
1607 """Encode/escape certain characters inside Markdown code runs.
1608 The point is that in code, these characters are literals,
1609 and lose their special Markdown meanings.
1610 """
1611 replacements = [
1612 # Encode all ampersands; HTML entities are not
1613 # entities within a Markdown code span.
1614 ('&', '&amp;'),
1615 # Do the angle bracket song and dance:
1616 ('<', '&lt;'),
1617 ('>', '&gt;'),
1618 ]
1619 for before, after in replacements:
1620 text = text.replace(before, after)
1621 hashed = _hash_text(text)
1622 self._escape_table[text] = hashed
1623 return hashed
1624
    # Emphasis patterns; the <strong> forms must be applied before <em>.
    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
    _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
    # "code-friendly" variants: only '*' (never '_') marks emphasis.
    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
    _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
    # Local extensions: ~~strike-through~~ and ~underline~.
    _code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S)
    _code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S)
1631 def _do_italics_and_bold(self, text):
1632 # <strong> must go first:
1633 if "code-friendly" in self.extras:
1634 text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
1635 text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
1636 text = self._code_friendly_line_re.sub(r"<span style='text-decoration:line-through'>\1</span>", text)
1637 text = self._code_friendly_underline_re.sub(r"<span style='text-decoration:underline'>\1</span>", text)
1638 else:
1639 text = self._strong_re.sub(r"<strong>\2</strong>", text)
1640 text = self._em_re.sub(r"<em>\2</em>", text)
1641 text = self._code_friendly_line_re.sub(r"<span style='text-decoration:line-through'>\1</span>", text)
1642 text = self._code_friendly_underline_re.sub(r"<span style='text-decoration:underline'>\1</span>", text)
1643 return text
1644
    # "smarty-pants" extra: Very liberal in interpreting a single prime as an
    # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
    # "twixt" can be written without an initial apostrophe. This is fine because
    # using scare quotes (single quotation marks) is rare.
    _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
    _contractions = ["tis", "twas", "twer", "neath", "o", "n",
                     "round", "bout", "twixt", "nuff", "fraid", "sup"]
1652 def _do_smart_contractions(self, text):
1653 text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
1654 for c in self._contractions:
1655 text = text.replace("'%s" % c, "&#8217;%s" % c)
1656 text = text.replace("'%s" % c.capitalize(),
1657 "&#8217;%s" % c.capitalize())
1658 return text
1659
    # Smart-quote patterns. Substitute double-quotes before single-quotes.
    # "Opening" quotes are preceded by whitespace/start and followed by a
    # non-space; "closing" quotes follow a non-space.
    _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
    _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
    _closing_single_quote_re = re.compile(r"(?<=\S)'")
    _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
1665 def _do_smart_punctuation(self, text):
1666 """Fancifies 'single quotes', "double quotes", and apostrophes.
1667 Converts --, ---, and ... into en dashes, em dashes, and ellipses.
1668
1669 Inspiration is: <http://daringfireball.net/projects/smartypants/>
1670 See "test/tm-cases/smarty_pants.text" for a full discussion of the
1671 support here and
1672 <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
1673 discussion of some diversion from the original SmartyPants.
1674 """
1675 if "'" in text: # guard for perf
1676 text = self._do_smart_contractions(text)
1677 text = self._opening_single_quote_re.sub("&#8216;", text)
1678 text = self._closing_single_quote_re.sub("&#8217;", text)
1679
1680 if '"' in text: # guard for perf
1681 text = self._opening_double_quote_re.sub("&#8220;", text)
1682 text = self._closing_double_quote_re.sub("&#8221;", text)
1683
1684 text = text.replace("---", "&#8212;")
1685 text = text.replace("--", "&#8211;")
1686 text = text.replace("...", "&#8230;")
1687 text = text.replace(" . . . ", "&#8230;")
1688 text = text.replace(". . .", "&#8230;")
1689 return text
1690
1691 _block_quote_re = re.compile(r'''
1692 ( # Wrap whole match in \1
1693 (
1694 ^[ \t]*>[ \t]? # '>' at the start of a line
1695 .+\n # rest of the first line
1696 (.+\n)* # subsequent consecutive lines
1697 \n* # blanks
1698 )+
1699 )
1700 ''', re.M | re.X)
1701 _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1702
    # A whole <pre>...</pre> block (with any leading whitespace); used to
    # undo blockquote indentation inside preformatted content.
    _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1704 def _dedent_two_spaces_sub(self, match):
1705 return re.sub(r'(?m)^ ', '', match.group(1))
1706
1707 def _block_quote_sub(self, match):
1708 bq = match.group(1)
1709 bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
1710 bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
1711 bq = self._run_block_gamut(bq) # recurse
1712
1713 bq = re.sub('(?m)^', ' ', bq)
1714 # These leading spaces screw with <pre> content, so we need to fix that:
1715 bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1716
1717 return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1718
1719 def _do_block_quotes(self, text):
1720 if '>' not in text:
1721 return text
1722 return self._block_quote_re.sub(self._block_quote_sub, text)
1723
1724 def _form_paragraphs(self, text):
1725 # Strip leading and trailing lines:
1726 text = text.strip('\n')
1727
1728 # Wrap <p> tags.
1729 grafs = []
1730 for i, graf in enumerate(re.split(r"\n{2,}", text)):
1731 if graf in self.html_blocks:
1732 # Unhashify HTML blocks
1733 grafs.append(self.html_blocks[graf])
1734 else:
1735 cuddled_list = None
1736 if "cuddled-lists" in self.extras:
1737 # Need to put back trailing '\n' for `_list_item_re`
1738 # match at the end of the paragraph.
1739 li = self._list_item_re.search(graf + '\n')
1740 # Two of the same list marker in this paragraph: a likely
1741 # candidate for a list cuddled to preceding paragraph
1742 # text (issue 33). Note the `[-1]` is a quick way to
1743 # consider numeric bullets (e.g. "1." and "2.") to be
1744 # equal.
1745 if (li and len(li.group(2)) <= 3 and li.group("next_marker")
1746 and li.group("marker")[-1] == li.group("next_marker")[-1]):
1747 start = li.start()
1748 cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
1749 assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
1750 graf = graf[:start]
1751
1752 # Wrap <p> tags.
1753 graf = self._run_span_gamut(graf)
1754 grafs.append("<p>" + graf.lstrip(" \t") + "</p>")
1755
1756 if cuddled_list:
1757 grafs.append(cuddled_list)
1758
1759 return "\n\n".join(grafs)
1760
1761 def _add_footnotes(self, text):
1762 if self.footnotes:
1763 footer = [
1764 '<div class="footnotes">',
1765 '<hr' + self.empty_element_suffix,
1766 '<ol>',
1767 ]
1768 for i, id in enumerate(self.footnote_ids):
1769 if i != 0:
1770 footer.append('')
1771 footer.append('<li id="fn-%s">' % id)
1772 footer.append(self._run_block_gamut(self.footnotes[id]))
1773 backlink = ('<a href="#fnref-%s" '
1774 'class="footnoteBackLink" '
1775 'title="Jump back to footnote %d in the text.">'
1776 '&#8617;</a>' % (id, i+1))
1777 if footer[-1].endswith("</p>"):
1778 footer[-1] = footer[-1][:-len("</p>")] \
1779 + '&nbsp;' + backlink + "</p>"
1780 else:
1781 footer.append("\n<p>%s</p>" % backlink)
1782 footer.append('</li>')
1783 footer.append('</ol>')
1784 footer.append('</div>')
1785 return text + '\n\n' + '\n'.join(footer)
1786 else:
1787 return text
1788
    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    # http://bumppo.net/projects/amputator/
    # Matches '&' not already part of an entity, and '<'/'>' that cannot
    # be part of a tag.
    _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
    _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
    _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)
1794
1795 def _encode_amps_and_angles(self, text):
1796 # Smart processing for ampersands and angle brackets that need
1797 # to be encoded.
1798 text = self._ampersand_re.sub('&amp;', text)
1799
1800 # Encode naked <'s
1801 text = self._naked_lt_re.sub('&lt;', text)
1802
1803 # Encode naked >'s
1804 # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1805 # Markdown) don't do this.
1806 text = self._naked_gt_re.sub('&gt;', text)
1807 return text
1808
1809 def _encode_backslash_escapes(self, text):
1810 for ch, escape in list(self._escape_table.items()):
1811 text = text.replace("\\"+ch, escape)
1812 return text
1813
1814 _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1815 def _auto_link_sub(self, match):
1816 g1 = match.group(1)
1817 return '<a href="%s">%s</a>' % (g1, g1)
1818
1819 _auto_email_link_re = re.compile(r"""
1820 <
1821 (?:mailto:)?
1822 (
1823 [-.\w]+
1824 \@
1825 [-\w]+(\.[-\w]+)*\.[a-z]+
1826 )
1827 >
1828 """, re.I | re.X | re.U)
1829 def _auto_email_link_sub(self, match):
1830 return self._encode_email_address(
1831 self._unescape_special_chars(match.group(1)))
1832
1833 def _do_auto_links(self, text):
1834 text = self._auto_link_re.sub(self._auto_link_sub, text)
1835 text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1836 return text
1837
1838 def _encode_email_address(self, addr):
1839 # Input: an email address, e.g. "foo@example.com"
1840 #
1841 # Output: the email address as a mailto link, with each character
1842 # of the address encoded as either a decimal or hex entity, in
1843 # the hopes of foiling most address harvesting spam bots. E.g.:
1844 #
1845 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1846 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1847 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1848 #
1849 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1850 # mailing list: <http://tinyurl.com/yu7ue>
1851 chars = [_xml_encode_email_char_at_random(ch)
1852 for ch in "mailto:" + addr]
1853 # Strip the mailto: from the visible part.
1854 addr = '<a href="%s">%s</a>' \
1855 % (''.join(chars), ''.join(chars[7:]))
1856 return addr
1857
1858 def _do_link_patterns(self, text):
1859 """Caveat emptor: there isn't much guarding against link
1860 patterns being formed inside other standard Markdown links, e.g.
1861 inside a [link def][like this].
1862
1863 Dev Notes: *Could* consider prefixing regexes with a negative
1864 lookbehind assertion to attempt to guard against this.
1865 """
1866 link_from_hash = {}
1867 for regex, repl in self.link_patterns:
1868 replacements = []
1869 for match in regex.finditer(text):
1870 if hasattr(repl, "__call__"):
1871 href = repl(match)
1872 else:
1873 href = match.expand(repl)
1874 replacements.append((match.span(), href))
1875 for (start, end), href in reversed(replacements):
1876 escaped_href = (
1877 href.replace('"', '&quot;') # b/c of attr quote
1878 # To avoid markdown <em> and <strong>:
1879 .replace('*', self._escape_table['*'])
1880 .replace('_', self._escape_table['_']))
1881 link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
1882 hash = _hash_text(link)
1883 link_from_hash[hash] = link
1884 text = text[:start] + hash + text[end:]
1885 for hash, link in list(link_from_hash.items()):
1886 text = text.replace(hash, link)
1887 return text
1888
1889 def _unescape_special_chars(self, text):
1890 # Swap back in all the special characters we've hidden.
1891 for ch, hash in list(self._escape_table.items()):
1892 text = text.replace(hash, ch)
1893 return text
1894
1895 def _outdent(self, text):
1896 # Remove one level of line-leading tabs or spaces
1897 return self._outdent_re.sub('', text)
1898
1899
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # Class-level default; overrides Markdown.extras.
    extras = ["footnotes", "code-color"]
1913
1914
1915#---- internal support functions
1916
class UnicodeWithAttrs(unicode):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.
    """
    # Set by the converter to a dict when the "metadata" extra is used.
    metadata = None
    # Set by the converter to a list of (level, id, name) header tuples
    # when the "toc" extra is used.
    _toc = None
    def toc_html(self):
        """Return the HTML for the current TOC.

        This expects the `_toc` attribute to have been set on this instance.
        """
        if self._toc is None:
            return None

        def indent():
            # One leading space per currently-open list level.
            return ' ' * (len(h_stack) - 1)
        lines = []
        h_stack = [0]  # stack of header-level numbers
        for level, id, name in self._toc:
            if level > h_stack[-1]:
                # Deeper header: open a nested <ul>.
                lines.append("%s<ul>" % indent())
                h_stack.append(level)
            elif level == h_stack[-1]:
                # Sibling header: close the previous <li> first.
                lines[-1] += "</li>"
            else:
                # Shallower header: close open lists until levels match.
                while level < h_stack[-1]:
                    h_stack.pop()
                    if not lines[-1].endswith("</li>"):
                        lines[-1] += "</li>"
                    lines.append("%s</ul></li>" % indent())
            lines.append('%s<li><a href="#%s">%s</a>' % (
                indent(), id, name))
        # Close any lists still open at the end of the TOC.
        while len(h_stack) > 1:
            h_stack.pop()
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul>" % indent())
        return '\n'.join(lines) + '\n'
    # Expose as a read-only property (pre-decorator style kept as-is).
    toc_html = property(toc_html)
1957
1958## {{{ http://code.activestate.com/recipes/577257/ (r1)
1959_slugify_strip_re = re.compile(r'[^\w\s-]')
1960_slugify_hyphenate_re = re.compile(r'[-\s]+')
1961def _slugify(value):
1962 """
1963 Normalizes string, converts to lowercase, removes non-alpha characters,
1964 and converts spaces to hyphens.
1965
1966 From Django's "django/template/defaultfilters.py".
1967 """
1968 import unicodedata
1969 value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
1970 value = _slugify_strip_re.sub('', value).strip().lower()
1971 return _slugify_hyphenate_re.sub('-', value)
1972## end of http://code.activestate.com/recipes/577257/ }}}
1973
1974
1975# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1976def _curry(*args, **kwargs):
1977 function, args = args[0], args[1:]
1978 def result(*rest, **kwrest):
1979 combined = kwargs.copy()
1980 combined.update(kwrest)
1981 return function(*args + rest, **combined)
1982 return result
1983
1984# Recipe: regex_from_encoded_pattern (1.0)
1985def _regex_from_encoded_pattern(s):
1986 """'foo' -> re.compile(re.escape('foo'))
1987 '/foo/' -> re.compile('foo')
1988 '/foo/i' -> re.compile('foo', re.I)
1989 """
1990 if s.startswith('/') and s.rfind('/') != 0:
1991 # Parse it: /PATTERN/FLAGS
1992 idx = s.rfind('/')
1993 pattern, flags_str = s[1:idx], s[idx+1:]
1994 flag_from_char = {
1995 "i": re.IGNORECASE,
1996 "l": re.LOCALE,
1997 "s": re.DOTALL,
1998 "m": re.MULTILINE,
1999 "u": re.UNICODE,
2000 }
2001 flags = 0
2002 for char in flags_str:
2003 try:
2004 flags |= flag_from_char[char]
2005 except KeyError:
2006 raise ValueError("unsupported regex flag: '%s' in '%s' "
2007 "(must be one of '%s')"
2008 % (char, s, ''.join(list(flag_from_char.keys()))))
2009 return re.compile(s[1:idx], flags)
2010 else: # not an encoded regex
2011 return re.compile(re.escape(s))
2012
2013# Recipe: dedent (0.1.2)
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    indents = []  # NOTE(review): appears unused; kept as-is
    margin = None  # smallest indent width seen (None until first code line)
    # Pass 1: compute the common left margin of all non-blank lines.
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                # Advance to the next tab stop.
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue  # skip all-whitespace lines
            else:
                break
        else:
            continue  # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    # Pass 2: strip `margin` columns of leading whitespace from each line.
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0  # columns of whitespace removed so far
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    # All-whitespace line: drop everything before the EOL.
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # A tab stepped past the margin: pad the overshoot
                    # with spaces so visual alignment is preserved.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                # Loop ran off the end of the line (shorter than margin).
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
2083
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    # splitlines(True) keeps line endings so join() reassembles losslessly
    # (was the magic literal `splitlines(1)`).
    lines = text.splitlines(True)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
2098
2099
2100class _memoized(object):
2101 """Decorator that caches a function's return value each time it is called.
2102 If called later with the same arguments, the cached value is returned, and
2103 not re-evaluated.
2104
2105 http://wiki.python.org/moin/PythonDecoratorLibrary
2106 """
2107 def __init__(self, func):
2108 self.func = func
2109 self.cache = {}
2110 def __call__(self, *args):
2111 try:
2112 return self.cache[args]
2113 except KeyError:
2114 self.cache[args] = value = self.func(*args)
2115 return value
2116 except TypeError:
2117 # uncachable -- for instance, passing a list as an argument.
2118 # Better to not cache than to blow up entirely.
2119 return self.func(*args)
2120 def __repr__(self):
2121 """Return the function's docstring."""
2122 return self.func.__doc__
2123
2124
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    # Matches a standalone XML processing instruction or a namespaced
    # single tag sitting on a line of its own (preceded and followed by
    # a blank line). `tab_width - 1`: a full tab width of indentation
    # would make the line a Markdown code block, so allow at most
    # tab_width-1 leading spaces.
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)           # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoize: one compiled regex per tab width.
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
2145
def _hr_tag_re_from_tab_width(tab_width):
    """Return a regex matching a standalone literal <hr> tag.

    `tab_width - 1`: a full tab width of indentation would make the line
    a Markdown code block, so allow at most tab_width-1 leading spaces.
    """
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoize: one compiled regex per tab width.
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2164
2165
2166def _xml_escape_attr(attr, skip_single_quote=True):
2167 """Escape the given string for use in an HTML/XML tag attribute.
2168
2169 By default this doesn't bother with escaping `'` to `&#39;`, presuming that
2170 the tag attribute is surrounded by double quotes.
2171 """
2172 escaped = (attr
2173 .replace('&', '&amp;')
2174 .replace('"', '&quot;')
2175 .replace('<', '&lt;')
2176 .replace('>', '&gt;'))
2177 if not skip_single_quote:
2178 escaped = escaped.replace("'", "&#39;")
2179 return escaped
2180
2181
2182def _xml_encode_email_char_at_random(ch):
2183 r = random()
2184 # Roughly 10% raw, 45% hex, 45% dec.
2185 # '@' *must* be encoded. I [John Gruber] insist.
2186 # Issue 26: '_' must be encoded.
2187 if r > 0.9 and ch not in "@_":
2188 return ch
2189 elif r < 0.45:
2190 # The [1:] is to drop leading '0': 0x63 -> x63
2191 return '&#%s;' % hex(ord(ch))[1:]
2192 else:
2193 return '&#%s;' % ord(ch)
2194
2195
2196
2197#---- mainline
2198
2199class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2200 """An optparse formatter that does NOT reflow the description."""
2201 def format_description(self, description):
2202 return description or ""
2203
2204def _test():
2205 import doctest
2206 doctest.testmod()
2207
def main(argv=None):
    """Command-line entry point: convert PATHS (or stdin) from Markdown
    to HTML and write the result to stdout.

    `argv` is the full argument vector (argv[0] is the program name);
    defaults to sys.argv. Returns _test()'s result for --self-test,
    otherwise None.
    """
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    # Bug fix: parse the given `argv` (minus the program name) instead of
    # always reading sys.argv -- previously the `argv` param was ignored.
    opts, paths = parser.parse_args(argv[1:])
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    # Parse "-x name[=arg]" strings into an extras dict (arg None if absent).
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    # Load "PATTERN HREF" lines from the link-patterns file, if given.
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    if not paths:
        paths = ['-']  # default to reading from stdin
    for path in paths:
        if path == '-':
            text = sys.stdin.read()
        else:
            fp = codecs.open(path, 'r', opts.encoding)
            text = fp.read()
            fp.close()
        if opts.compare:
            # Pipe the same input through Markdown.pl for comparison.
            from subprocess import Popen, PIPE
            print("==== Markdown.pl ====")
            p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
            p.stdin.write(text.encode('utf-8'))
            p.stdin.close()
            perl_html = p.stdout.read().decode('utf-8')
            if py3:
                sys.stdout.write(perl_html)
            else:
                sys.stdout.write(perl_html.encode(
                    sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
            print("==== markdown2.py ====")
        html = markdown(text,
            html4tags=opts.html4tags,
            safe_mode=opts.safe_mode,
            extras=extras, link_patterns=link_patterns,
            use_file_vars=opts.use_file_vars)
        if py3:
            sys.stdout.write(html)
        else:
            sys.stdout.write(html.encode(
                sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if extras and "toc" in extras:
            log.debug("toc_html: " +
                html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if opts.compare:
            # Normalize both outputs via the test suite helper, if present,
            # before comparing.
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))
2336
2337
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code.
    sys.exit( main(sys.argv) )

Archive Download this file

Branches

Tags