Update Markdown parser from 2.1.1 to 2.3.2

Gregory Soutade 2020-03-20 16:55:45 +01:00
parent 4b642fa48a
commit 7cb4f1d3d7
1 changed file with 365 additions and 180 deletions


@ -53,8 +53,9 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
* header-ids: Adds "id" attributes to headers. The id value is a slug of
the header text.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
string to use for a "class" tag attribute. Currently only supports
"pre" and "code" tags. Add an issue if you require this for other tags.
string to use for a "class" tag attribute. Currently only supports "img",
"table", "pre" and "code" tags. Add an issue if you require this for other
tags.
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
have markdown processing be done on its contents. Similar to
<http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
@ -70,9 +71,14 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
* smarty-pants: Replaces ' and " with curly quotation marks or curly
apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* toc: The returned HTML string gets a new "toc_html" attribute which is
a Table of Contents for the document. (experimental)
* xml: Passes one-liner processing instructions and namespaced XML tags.
* tables: Tables using the same format as GFM
<https://help.github.com/articles/github-flavored-markdown#tables> and
PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* wiki-tables: Google Code Wiki-style tables. See
<http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
"""
@ -82,13 +88,11 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
# not yet sure if there are implications with this. Compare 'pydoc sre'
# and 'perldoc perlre'.
__version_info__ = (2, 1, 1)
__version_info__ = (2, 3, 2)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"
import os
import sys
from pprint import pprint
import re
import logging
try:
@ -100,15 +104,9 @@ from random import random, randint
import codecs
#---- Python version compat
# ---- Python version compat
try:
from urllib.parse import quote # python3
except ImportError:
from urllib import quote # python2
if sys.version_info[:2] < (2,4):
from sets import Set as set
if sys.version_info[:2] < (2, 4):
def reversed(sequence):
for i in sequence[::-1]:
yield i
@ -127,8 +125,7 @@ elif sys.version_info[0] >= 3:
base_string_type = str
#---- globals
# ---- globals
DEBUG = False
log = logging.getLogger("markdown")
@ -145,15 +142,12 @@ g_escape_table = dict([(ch, _hash_text(ch))
for ch in '\\`*_{}[]()>#+-.!'])
#---- exceptions
# ---- exceptions
class MarkdownError(Exception):
pass
#---- public api
# ---- public api
def markdown_path(path, encoding="utf-8",
html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
@ -167,6 +161,7 @@ def markdown_path(path, encoding="utf-8",
link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text)
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
safe_mode=None, extras=None, link_patterns=None,
use_file_vars=False):
@ -175,6 +170,7 @@ def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text)
class Markdown(object):
# The dict of "extras" to enable in processing -- a mapping of
# extra name to argument for the extra. Most extras do not have an
@ -222,7 +218,7 @@ class Markdown(object):
extras = dict([(e, None) for e in extras])
self.extras.update(extras)
assert isinstance(self.extras, dict)
if "toc" in self.extras and not "header-ids" in self.extras:
if "toc" in self.extras and "header-ids" not in self.extras:
self.extras["header-ids"] = None # "toc" implies "header-ids"
self._instance_extras = self.extras.copy()
@ -254,6 +250,11 @@ class Markdown(object):
# should only be used in <a> tags with an "href" attribute.
_a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
# Opens the linked document in a new window or tab
# should only be used in <a> tags with a "target" attribute
# (same regex as _a_nofollow).
_a_blank = _a_nofollow
def convert(self, text):
"""Convert the given text."""
# Main function. The order in which other subs are called here is
@ -268,7 +269,7 @@ class Markdown(object):
self.reset()
if not isinstance(text, unicode):
#TODO: perhaps shouldn't presume UTF-8 for string input?
# TODO: perhaps shouldn't presume UTF-8 for string input?
text = unicode(text, 'utf-8')
if self.use_file_vars:
@ -288,7 +289,8 @@ class Markdown(object):
self.extras[ename] = earg
# Standardize line endings:
text = re.sub("\r\n|\r", "\n", text)
text = text.replace("\r\n", "\n")
text = text.replace("\r", "\n")
# Make sure $text ends with a couple of newlines:
text += "\n\n"
@ -308,13 +310,16 @@ class Markdown(object):
text = self.preprocess(text)
if "fenced-code-blocks" in self.extras and not self.safe_mode:
text = self._do_fenced_code_blocks(text)
if self.safe_mode:
text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries
text = self._hash_html_blocks(text, raw=True)
if "fenced-code-blocks" in self.extras:
if "fenced-code-blocks" in self.extras and self.safe_mode:
text = self._do_fenced_code_blocks(text)
# Strip link definitions, store in hashes.
@ -340,6 +345,9 @@ class Markdown(object):
if "nofollow" in self.extras:
text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)
if "target-blank-links" in self.extras:
text = self._a_blank.sub(r'<\1 target="_blank"\2', text)
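A short sketch of the two link-decorating extras applied just above (extra names taken from the checks; exact attribute order is whatever the two substitutions produce):

import markdown2

out = markdown2.markdown("[site](http://example.com/)",
                         extras=["nofollow", "target-blank-links"])
# the anchor ends up with both rel="nofollow" and target="_blank", roughly
# <p><a target="_blank" rel="nofollow" href="http://example.com/">site</a></p>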
text += "\n"
rv = UnicodeWithAttrs(text)
@ -363,18 +371,29 @@ class Markdown(object):
"""
return text
# Is metadata if the content starts with '---'-fenced `key: value`
# Is metadata if the content starts with optional '---'-fenced `key: value`
# pairs. E.g. (indented for presentation):
# ---
# foo: bar
# another-var: blah blah
# ---
_metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
# # header
# or:
# foo: bar
# another-var: blah blah
#
# # header
_metadata_pat = re.compile(r"""
^
(?:---[\ \t]*\n)? # optional "---"
((?:[ \t]*[^ \t:]+[\ \t]*:[^\n]*\n)+) # "key: value" pairs
(?:---[ \t]*)? # optional "---"
\n""",
re.VERBOSE
)
def _extract_metadata(self, text):
# fast test
if not text.startswith("---"):
return text
match = self._metadata_pat.match(text)
if not match:
return text
@ -387,7 +406,6 @@ class Markdown(object):
return tail
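A sketch of the front-matter handling above using the fenced form; the "metadata" extra name and the `.metadata` attribute on the result are assumptions, neither is visible in this hunk:

import markdown2

doc = "---\ntitle: Hello\nauthor: someone\n---\n\n# header\n"
result = markdown2.markdown(doc, extras=["metadata"])
# result.metadata is assumed to be {'title': 'Hello', 'author': 'someone'};
# per the new pattern, the leading/trailing "---" fences are now optional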
_emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
# This regular expression is intended to match blocks like this:
# PREFIX Local Variables: SUFFIX
@ -448,7 +466,7 @@ class Markdown(object):
prefix = match.group("prefix")
suffix = match.group("suffix")
lines = match.group("content").splitlines(0)
#print "prefix=%r, suffix=%r, content=%r, lines: %s"\
# print "prefix=%r, suffix=%r, content=%r, lines: %s"\
# % (prefix, suffix, match.group("content"), lines)
# Validate the Local Variables block: proper prefix and suffix
@ -505,14 +523,19 @@ class Markdown(object):
return emacs_vars
# Cribbed from a post by Bart Lateur:
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
_detab_re = re.compile(r'(.*?)\t', re.M)
def _detab_sub(self, match):
g1 = match.group(1)
return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
def _detab_line(self, line):
r"""Recusively convert tabs to spaces in a single line.
Called from _detab()."""
if '\t' not in line:
return line
chunk1, chunk2 = line.split('\t', 1)
chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
output = chunk1 + chunk2
return self._detab_line(output)
def _detab(self, text):
r"""Remove (leading?) tabs from a file.
r"""Iterate text line by line and convert tabs to spaces.
>>> m = Markdown()
>>> m._detab("\tfoo")
@ -528,7 +551,10 @@ class Markdown(object):
"""
if '\t' not in text:
return text
return self._detab_re.subn(self._detab_sub, text)[0]
output = []
for line in text.splitlines():
output.append(self._detab_line(line))
return '\n'.join(output)
# I broke out the html5 tags here and add them to _block_tags_a and
# _block_tags_b. This way html5 tags are easy to keep track of.
@ -776,12 +802,7 @@ class Markdown(object):
re.X | re.M)
return footnote_def_re.sub(self._extract_footnote_def_sub, text)
_hr_data = [
('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
]
_hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
def _run_block_gamut(self, text):
# These are all the transformations that form block-level
@ -798,13 +819,7 @@ class Markdown(object):
# Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
# hr chars to one or two. We'll reproduce that limit here.
hr = "\n<hr"+self.empty_element_suffix+"\n"
for ch, regex in self._hr_data:
if ch in text:
for m in reversed(list(regex.finditer(text))):
tail = m.group(1).rstrip()
if not tail.strip(ch + ' ') and tail.count(" ") == 0:
start, end = m.span()
text = text[:start] + hr + text[end:]
text = re.sub(self._hr_re, hr, text)
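The per-character `_hr_data` loop is gone; a quick check of the consolidated rule (pattern copied verbatim from `_hr_re` above):

import re

hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
for line in ("---", "* * *", "___", "- - -"):
    assert hr_re.match(line)   # all accepted as horizontal rules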
text = self._do_lists(text)
@ -812,6 +827,8 @@ class Markdown(object):
text = self._prepare_pyshell_blocks(text)
if "wiki-tables" in self.extras:
text = self._do_wiki_tables(text)
if "tables" in self.extras:
text = self._do_tables(text)
text = self._do_code_blocks(text)
@ -852,16 +869,89 @@ class Markdown(object):
return _pyshell_block_re.sub(self._pyshell_block_sub, text)
def _table_sub(self, match):
trim_space_re = '^[ \t\n]+|[ \t\n]+$'
trim_bar_re = '^\||\|$'
head, underline, body = match.groups()
# Determine aligns for columns.
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)).split('|')]
align_from_col_idx = {}
for col_idx, col in enumerate(cols):
if col[0] == ':' and col[-1] == ':':
align_from_col_idx[col_idx] = ' align="center"'
elif col[0] == ':':
align_from_col_idx[col_idx] = ' align="left"'
elif col[-1] == ':':
align_from_col_idx[col_idx] = ' align="right"'
# thead
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <th%s>%s</th>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</thead>')
# tbody
hlines.append('<tbody>')
for line in body.strip('\n').split('\n'):
hlines.append('<tr>')
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <td%s>%s</td>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</tbody>')
hlines.append('</table>')
return '\n'.join(hlines) + '\n'
def _do_tables(self, text):
"""Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
"""
less_than_tab = self.tab_width - 1
table_re = re.compile(r'''
(?:(?<=\n\n)|\A\n?) # leading blank line
^[ ]{0,%d} # allowed whitespace
(.*[|].*) \n # $1: header row (at least one pipe)
^[ ]{0,%d} # allowed whitespace
( # $2: underline row
# underline row with leading bar
(?: \|\ *:?-+:?\ * )+ \|? \n
|
# or, underline row without leading bar
(?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
)
( # $3: data rows
(?:
^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
.*\|.* \n
)+
)
''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
return table_re.sub(self._table_sub, text)
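A sketch of the new 'tables' extra wired up here (GFM-style syntax per the docstring addition above):

import markdown2

table_md = (
    "| Name  | Score |\n"
    "| :---- | ----: |\n"
    "| Alice |    10 |\n"
    "| Bob   |     7 |\n")
print(markdown2.markdown(table_md, extras=["tables"]))
# emits <table>/<thead>/<tbody> markup; align="left"/"right" comes from
# the :--- / ---: markers in the underline row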
def _wiki_table_sub(self, match):
ttext = match.group(0).strip()
#print 'wiki table: %r' % match.group(0)
# print 'wiki table: %r' % match.group(0)
rows = []
for line in ttext.splitlines(0):
line = line.strip()[2:-2].strip()
row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
rows.append(row)
#pprint(rows)
hlines = ['<table>', '<tbody>']
# pprint(rows)
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
for row in rows:
hrow = ['<tr>']
for cell in row:
@ -907,12 +997,18 @@ class Markdown(object):
text = self._encode_amps_and_angles(text)
if "strike" in self.extras:
text = self._do_strike(text)
text = self._do_italics_and_bold(text)
if "smarty-pants" in self.extras:
text = self._do_smart_punctuation(text)
# Do hard breaks:
if "break-on-newline" in self.extras:
text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
else:
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
return text
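A sketch of the new 'break-on-newline' branch above:

import markdown2

out = markdown2.markdown("first line\nsecond line",
                         extras=["break-on-newline"])
# every bare newline becomes <br />; without the extra, a line break still
# requires two or more trailing spaces before the newline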
@ -1003,22 +1099,14 @@ class Markdown(object):
raise MarkdownError("invalid value for 'safe_mode': %r (must be "
"'escape' or 'replace')" % self.safe_mode)
_tail_of_inline_link_re = re.compile(r'''
# Match tail of: [text](/url/) or [text](/url/ "title")
\( # literal paren
[ \t]*
(?P<url> # \1
<.*?>
|
.*?
)
[ \t]*
( # \2
(['"]) # quote char = \3
_inline_link_title = re.compile(r'''
( # \1
[ \t]+
(['"]) # quote char = \2
(?P<title>.*?)
\3 # matching quote
\2
)? # title is optional
\)
\)$
''', re.X | re.S)
_tail_of_reference_link_re = re.compile(r'''
# Match tail of: [text][id]
@ -1029,6 +1117,52 @@ class Markdown(object):
\]
''', re.X | re.S)
_whitespace = re.compile(r'\s*')
_strip_anglebrackets = re.compile(r'<(.*)>.*')
def _find_non_whitespace(self, text, start):
"""Returns the index of the first non-whitespace character in text
after (and including) start
"""
match = self._whitespace.match(text, start)
return match.end()
def _find_balanced(self, text, start, open_c, close_c):
"""Returns the index where the open_c and close_c characters balance
out - the same number of open_c and close_c are encountered - or the
end of string if it's reached before the balance point is found.
"""
i = start
l = len(text)
count = 1
while count > 0 and i < l:
if text[i] == open_c:
count += 1
elif text[i] == close_c:
count -= 1
i += 1
return i
def _extract_url_and_title(self, text, start):
"""Extracts the url and (optional) title from the tail of a link"""
# text[start] equals the opening parenthesis
idx = self._find_non_whitespace(text, start+1)
if idx == len(text):
return None, None, None
end_idx = idx
has_anglebrackets = text[idx] == "<"
if has_anglebrackets:
end_idx = self._find_balanced(text, end_idx+1, "<", ">")
end_idx = self._find_balanced(text, end_idx, "(", ")")
match = self._inline_link_title.search(text, idx, end_idx)
if not match:
return None, None, None
url, title = text[idx:match.start()], match.group("title")
if has_anglebrackets:
url = self._strip_anglebrackets.sub(r'\1', url)
return url, title, end_idx
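The balance counting above is what lets inline links carry parentheses inside the URL, which the old single tail regex could not handle; sketch:

import markdown2

md = "[wiki](https://en.wikipedia.org/wiki/Markdown_(disambiguation))"
print(markdown2.markdown(md))
# the full URL, including the trailing "(disambiguation)", lands in href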
def _do_links(self, text):
"""Turn Markdown link shortcuts into XHTML <a> and <img> tags.
@ -1111,8 +1245,8 @@ class Markdown(object):
# Inline anchor or img?
if text[p] == '(': # attempt at perf improvement
match = self._tail_of_inline_link_re.match(text, p)
if match:
url, title, url_end_idx = self._extract_url_and_title(text, p)
if url is not None:
# Handle an inline anchor or img.
is_img = start_idx > 0 and text[start_idx-1] == "!"
if is_img:
@ -1123,9 +1257,6 @@ class Markdown(object):
start_idx -= 1
is_img = 1
url, title = match.group("url"), match.group("title")
if url and url[0] == '<':
url = url[1:-1] # '<url>' -> 'url'
# We've got to encode these to avoid conflicting
# with italics/bold.
url = url.replace('*', self._escape_table['*']) \
@ -1138,20 +1269,17 @@ class Markdown(object):
else:
title_str = ''
if is_img:
img_class_str = self._html_class_str_from_tag("img")
if is_inline_img:
result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \
img_class_str = ' class="inlineimage"'
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, self.empty_element_suffix)
else:
result = '<img src="%s" alt="%s"%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, self.empty_element_suffix)
title_str, img_class_str, self.empty_element_suffix)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
text = text[:start_idx] + result + text[url_end_idx:]
elif start_idx >= anchor_allowed_pos:
result_head = '<a href="%s"%s>' % (url, title_str)
result = '%s%s</a>' % (result_head, link_text)
@ -1161,7 +1289,7 @@ class Markdown(object):
# anchor_allowed_pos on.
curr_pos = start_idx + len(result_head)
anchor_allowed_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
text = text[:start_idx] + result + text[url_end_idx:]
else:
# Anchor not allowed here.
curr_pos = start_idx + 1
@ -1186,7 +1314,6 @@ class Markdown(object):
.replace('_', self._escape_table['_'])
title = self.titles.get(link_id)
if title:
before = title
title = _xml_escape_attr(title) \
.replace('*', self._escape_table['*']) \
.replace('_', self._escape_table['_'])
@ -1194,10 +1321,11 @@ class Markdown(object):
else:
title_str = ''
if is_img:
result = '<img src="%s" alt="%s"%s%s' \
img_class_str = self._html_class_str_from_tag("img")
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'),
link_text.replace('"', '&quot;'),
title_str, self.empty_element_suffix)
title_str, img_class_str, self.empty_element_suffix)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result)
@ -1258,44 +1386,42 @@ class Markdown(object):
self._toc = []
self._toc.append((level, id, self._unescape_special_chars(name)))
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
def _setext_h_sub(self, match):
n = {"=": 1, "-": 2}[match.group(2)[0]]
demote_headers = self.extras.get("demote-headers")
if demote_headers:
n = min(n + demote_headers, 6)
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(1),
self.extras["header-ids"], n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(1))
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
_atx_h_re = re.compile(r'''
^(\#{1,6}) # \1 = string of #'s
[ \t]+
_h_re_base = r'''
(^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
|
(^(\#{1,6}) # \1 = string of #'s
[ \t]%s
(.+?) # \2 = Header text
[ \t]*
(?<!\\) # ensure not an escaped trailing '#'
\#* # optional closing #'s (not counted)
\n+
''', re.X | re.M)
def _atx_h_sub(self, match):
n = len(match.group(1))
)
'''
_h_re = re.compile(_h_re_base % '*', re.X | re.M)
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
def _h_sub(self, match):
if match.group(1) is not None:
# Setext header
n = {"=": 1, "-": 2}[match.group(3)[0]]
header_group = match.group(2)
else:
# atx header
n = len(match.group(5))
header_group = match.group(6)
demote_headers = self.extras.get("demote-headers")
if demote_headers:
n = min(n + demote_headers, 6)
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(2),
header_id = self.header_id_from_text(header_group,
self.extras["header-ids"], n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(2))
html = self._run_span_gamut(header_group)
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
@ -1307,7 +1433,6 @@ class Markdown(object):
#
# Header 2
# --------
text = self._setext_h_re.sub(self._setext_h_sub, text)
# atx-style headers:
# # Header 1
@ -1315,10 +1440,10 @@ class Markdown(object):
# ## Header 2 with closing hashes ##
# ...
# ###### Header 6
text = self._atx_h_re.sub(self._atx_h_sub, text)
return text
if 'tag-friendly' in self.extras:
return self._h_re_tag_friendly.sub(self._h_sub, text)
return self._h_re.sub(self._h_sub, text)
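Setext and atx headers now share one regex; the 'tag-friendly' extra requires a space after the leading '#', so hashtags are left alone. Sketch:

import markdown2

print(markdown2.markdown("Title\n=====\n\n## Section ##\n"))
# <h1>Title</h1> ... <h2>Section</h2>

print(markdown2.markdown("#hashtag", extras=["tag-friendly"]))
# not turned into an <h1>, because there is no space after '#'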
_marker_ul_chars = '*+-'
_marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
@ -1380,8 +1505,9 @@ class Markdown(object):
hits.sort()
match = hits[0][1]
start, end = match.span()
text = text[:start] + self._list_sub(match) + text[end:]
pos = end
middle = self._list_sub(match)
text = text[:start] + middle + text[end:]
pos = start + len(middle) # start pos for next attempted match
return text
@ -1395,11 +1521,25 @@ class Markdown(object):
''' % (_marker_any, _marker_any),
re.M | re.X | re.S)
_task_list_item_re = re.compile(r'''
(\[[\ x]\])[ \t]+ # tasklist marker = \1
(.*) # list item text = \2
''', re.M | re.X | re.S)
_task_list_warpper_str = r'<p><input type="checkbox" class="task-list-item-checkbox" %sdisabled>%s</p>'
def _task_list_item_sub(self, match):
marker = match.group(1)
item_text = match.group(2)
if marker == '[x]':
return self._task_list_warpper_str % ('checked ', item_text)
elif marker == '[ ]':
return self._task_list_warpper_str % ('', item_text)
_last_li_endswith_two_eols = False
def _list_item_sub(self, match):
item = match.group(4)
leading_line = match.group(1)
leading_space = match.group(2)
if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
item = self._run_block_gamut(self._outdent(item))
else:
@ -1409,6 +1549,10 @@ class Markdown(object):
item = item[:-1]
item = self._run_span_gamut(item)
self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
if "task_list" in self.extras:
item = self._task_list_item_re.sub(self._task_list_item_sub, item)
return "<li>%s</li>\n" % item
def _process_list_items(self, list_str):
@ -1497,8 +1641,20 @@ class Markdown(object):
formatter_opts = self.extras['code-color'] or {}
if lexer_name:
def unhash_code(codeblock):
for key, sanitized in list(self.html_spans.items()):
codeblock = codeblock.replace(key, sanitized)
replacements = [
("&amp;", "&"),
("&lt;", "<"),
("&gt;", ">")
]
for old, new in replacements:
codeblock = codeblock.replace(old, new)
return codeblock
lexer = self._get_pygments_lexer(lexer_name)
if lexer:
codeblock = unhash_code( codeblock )
colored = self._color_with_pygments(codeblock, lexer,
**formatter_opts)
return "\n\n%s\n\n" % colored
@ -1535,19 +1691,22 @@ class Markdown(object):
)+
)
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
# Lookahead to make sure this block isn't already in a code block.
# Needed when syntax highlighting is being used.
(?![^<]*\</code\>)
''' % (self.tab_width, self.tab_width),
re.M | re.X)
return code_block_re.sub(self._code_block_sub, text)
_fenced_code_block_re = re.compile(r'''
(?:\n\n|\A\n?)
(?:\n+|\A\n?)
^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
(.*?) # $2 = code block content
^```[ \t]*\n # closing fence
''', re.M | re.X | re.S)
def _fenced_code_block_sub(self, match):
return self._code_block_sub(match, is_fenced_code_block=True);
return self._code_block_sub(match, is_fenced_code_block=True)
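For reference, a sketch of the 'fenced-code-blocks' extra these two pieces implement:

import markdown2

md = "```python\nprint('hi')\n```\n"
print(markdown2.markdown(md, extras=["fenced-code-blocks"]))
# yields a <pre><code> block; with Pygments available, the "python" tag
# picks a lexer so the block is syntax-coloured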
def _do_fenced_code_blocks(self, text):
"""Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
@ -1620,12 +1779,17 @@ class Markdown(object):
self._escape_table[text] = hashed
return hashed
_strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
def _do_strike(self, text):
text = self._strike_re.sub(r"<strike>\1</strike>", text)
return text
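The 'strike' extra hooked in above is a one-liner; sketch:

import markdown2

print(markdown2.markdown("~~gone~~ kept", extras=["strike"]))
# "~~gone~~" becomes <strike>gone</strike>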
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
_em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
_code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S)
_code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
def _do_italics_and_bold(self, text):
# <strong> must go first:
if "code-friendly" in self.extras:
@ -1686,37 +1850,52 @@ class Markdown(object):
text = text.replace(". . .", "&#8230;")
return text
_block_quote_re = re.compile(r'''
_block_quote_base = r'''
( # Wrap whole match in \1
(
^[ \t]*>[ \t]? # '>' at the start of a line
^[ \t]*>%s[ \t]? # '>' at the start of a line
.+\n # rest of the first line
(.+\n)* # subsequent consecutive lines
\n* # blanks
)+
)
''', re.M | re.X)
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
'''
_block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
_block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
_bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
_bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
_html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
def _dedent_two_spaces_sub(self, match):
return re.sub(r'(?m)^ ', '', match.group(1))
def _block_quote_sub(self, match):
bq = match.group(1)
bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
# trim one level of quoting
if is_spoiler:
bq = self._bq_one_level_re_spoiler.sub('', bq)
else:
bq = self._bq_one_level_re.sub('', bq)
# trim whitespace-only lines
bq = self._ws_only_line_re.sub('', bq)
bq = self._run_block_gamut(bq) # recurse
bq = re.sub('(?m)^', ' ', bq)
# These leading spaces screw with <pre> content, so we need to fix that:
bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
return "<blockquote>\n%s\n</blockquote>\n\n" % bq
if is_spoiler:
return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
else:
return '<blockquote>\n%s\n</blockquote>\n\n' % bq
def _do_block_quotes(self, text):
if '>' not in text:
return text
if 'spoiler' in self.extras:
return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
else:
return self._block_quote_re.sub(self._block_quote_sub, text)
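A sketch of the 'spoiler' extra wired up above (syntax per the Stack Exchange convention linked in the docstring):

import markdown2

md = ">! this text is hidden\n>! until the reader clicks\n"
print(markdown2.markdown(md, extras=["spoiler"]))
# a quote whose lines all start with ">!" renders as
# <blockquote class="spoiler">...</blockquote>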
def _form_paragraphs(self, text):
@ -1774,7 +1953,7 @@ class Markdown(object):
'&#8617;</a>' % (id, i+1))
if footer[-1].endswith("</p>"):
footer[-1] = footer[-1][:-len("</p>")] \
+ '&nbsp;' + backlink + "</p>"
+ '&#160;' + backlink + "</p>"
else:
footer.append("\n<p>%s</p>" % backlink)
footer.append('</li>')
@ -1910,7 +2089,7 @@ class MarkdownWithExtras(Markdown):
extras = ["footnotes", "code-color"]
#---- internal support functions
# ---- internal support functions
class UnicodeWithAttrs(unicode):
"""A subclass of unicode used for the return value of conversion to
@ -1979,6 +2158,7 @@ def _curry(*args, **kwargs):
return function(*args + rest, **combined)
return result
# Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
@ -2008,6 +2188,7 @@ def _regex_from_encoded_pattern(s):
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
@ -2025,7 +2206,6 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
if DEBUG:
print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
% (tabsize, skip_first_line))
indents = []
margin = None
for i, line in enumerate(lines):
if i == 0 and skip_first_line: continue
@ -2079,6 +2259,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
lines[i] = lines[i][removed:]
return lines
def _dedent(text, tabsize=8, skip_first_line=False):
"""_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
@ -2105,6 +2286,7 @@ class _memoized(object):
def __init__(self, func):
self.func = func
self.cache = {}
def __call__(self, *args):
try:
return self.cache[args]
@ -2115,6 +2297,7 @@ class _memoized(object):
# uncachable -- for instance, passing a list as an argument.
# Better to not cache than to blow up entirely.
return self.func(*args)
def __repr__(self):
"""Return the function's docstring."""
return self.func.__doc__
@ -2141,6 +2324,7 @@ def _xml_oneliner_re_from_tab_width(tab_width):
""" % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
def _hr_tag_re_from_tab_width(tab_width):
return re.compile(r"""
(?:
@ -2191,18 +2375,19 @@ def _xml_encode_email_char_at_random(ch):
return '&#%s;' % ord(ch)
#---- mainline
# ---- mainline
class _NoReflowFormatter(optparse.IndentedHelpFormatter):
"""An optparse formatter that does NOT reflow the description."""
def format_description(self, description):
return description or ""
def _test():
import doctest
doctest.testmod()
def main(argv=None):
if argv is None:
argv = sys.argv
@ -2319,7 +2504,7 @@ def main(argv=None):
sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
if extras and "toc" in extras:
log.debug("toc_html: " +
html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
if opts.compare:
test_dir = join(dirname(dirname(abspath(__file__))), "test")
if exists(join(test_dir, "test_markdown2.py")):
@ -2334,4 +2519,4 @@ def main(argv=None):
if __name__ == "__main__":
sys.exit( main(sys.argv) )
sys.exit(main(sys.argv))