Update Markdown parser from 2.1.1 to 2.3.2

This commit is contained in:
Gregory Soutade 2020-03-20 16:55:45 +01:00
parent 4b642fa48a
commit 7cb4f1d3d7
1 changed files with 365 additions and 180 deletions

View File

@ -53,8 +53,9 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
* header-ids: Adds "id" attributes to headers. The id value is a slug of * header-ids: Adds "id" attributes to headers. The id value is a slug of
the header text. the header text.
* html-classes: Takes a dict mapping html tag names (lowercase) to a * html-classes: Takes a dict mapping html tag names (lowercase) to a
string to use for a "class" tag attribute. Currently only supports string to use for a "class" tag attribute. Currently only supports "img",
"pre" and "code" tags. Add an issue if you require this for other tags. "table", "pre" and "code" tags. Add an issue if you require this for other
tags.
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
have markdown processing be done on its contents. Similar to have markdown processing be done on its contents. Similar to
<http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
@ -70,9 +71,14 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
* smarty-pants: Replaces ' and " with curly quotation marks or curly * smarty-pants: Replaces ' and " with curly quotation marks or curly
apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes, apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
and ellipses. and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* toc: The returned HTML string gets a new "toc_html" attribute which is * toc: The returned HTML string gets a new "toc_html" attribute which is
a Table of Contents for the document. (experimental) a Table of Contents for the document. (experimental)
* xml: Passes one-liner processing instructions and namespaced XML tags. * xml: Passes one-liner processing instructions and namespaced XML tags.
* tables: Tables using the same format as GFM
<https://help.github.com/articles/github-flavored-markdown#tables> and
PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* wiki-tables: Google Code Wiki-style tables. See * wiki-tables: Google Code Wiki-style tables. See
<http://code.google.com/p/support/wiki/WikiSyntax#Tables>. <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
""" """
@ -82,13 +88,11 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
# not yet sure if there implications with this. Compare 'pydoc sre' # not yet sure if there implications with this. Compare 'pydoc sre'
# and 'perldoc perlre'. # and 'perldoc perlre'.
__version_info__ = (2, 1, 1) __version_info__ = (2, 3, 2)
__version__ = '.'.join(map(str, __version_info__)) __version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick" __author__ = "Trent Mick"
import os
import sys import sys
from pprint import pprint
import re import re
import logging import logging
try: try:
@ -102,13 +106,7 @@ import codecs
# ---- Python version compat # ---- Python version compat
try:
from urllib.parse import quote # python3
except ImportError:
from urllib import quote # python2
if sys.version_info[:2] < (2, 4): if sys.version_info[:2] < (2, 4):
from sets import Set as set
def reversed(sequence): def reversed(sequence):
for i in sequence[::-1]: for i in sequence[::-1]:
yield i yield i
@ -127,7 +125,6 @@ elif sys.version_info[0] >= 3:
base_string_type = str base_string_type = str
# ---- globals # ---- globals
DEBUG = False DEBUG = False
@ -145,14 +142,11 @@ g_escape_table = dict([(ch, _hash_text(ch))
for ch in '\\`*_{}[]()>#+-.!']) for ch in '\\`*_{}[]()>#+-.!'])
# ---- exceptions # ---- exceptions
class MarkdownError(Exception): class MarkdownError(Exception):
pass pass
# ---- public api # ---- public api
def markdown_path(path, encoding="utf-8", def markdown_path(path, encoding="utf-8",
@ -167,6 +161,7 @@ def markdown_path(path, encoding="utf-8",
link_patterns=link_patterns, link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text) use_file_vars=use_file_vars).convert(text)
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH, def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
safe_mode=None, extras=None, link_patterns=None, safe_mode=None, extras=None, link_patterns=None,
use_file_vars=False): use_file_vars=False):
@ -175,6 +170,7 @@ def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
link_patterns=link_patterns, link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text) use_file_vars=use_file_vars).convert(text)
class Markdown(object): class Markdown(object):
# The dict of "extras" to enable in processing -- a mapping of # The dict of "extras" to enable in processing -- a mapping of
# extra name to argument for the extra. Most extras do not have an # extra name to argument for the extra. Most extras do not have an
@ -222,7 +218,7 @@ class Markdown(object):
extras = dict([(e, None) for e in extras]) extras = dict([(e, None) for e in extras])
self.extras.update(extras) self.extras.update(extras)
assert isinstance(self.extras, dict) assert isinstance(self.extras, dict)
if "toc" in self.extras and not "header-ids" in self.extras: if "toc" in self.extras and "header-ids" not in self.extras:
self.extras["header-ids"] = None # "toc" implies "header-ids" self.extras["header-ids"] = None # "toc" implies "header-ids"
self._instance_extras = self.extras.copy() self._instance_extras = self.extras.copy()
@ -254,6 +250,11 @@ class Markdown(object):
# should only be used in <a> tags with an "href" attribute. # should only be used in <a> tags with an "href" attribute.
_a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE) _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
# Regex used by the "target-blank-links" extra to inject a
# `target="_blank"` attribute, so that the linked document opens in a new
# window or tab. It is deliberately the same pattern as _a_nofollow: only
# <a> tags that carry an "href" attribute are matched.
_a_blank = _a_nofollow
def convert(self, text): def convert(self, text):
"""Convert the given text.""" """Convert the given text."""
# Main function. The order in which other subs are called here is # Main function. The order in which other subs are called here is
@ -288,7 +289,8 @@ class Markdown(object):
self.extras[ename] = earg self.extras[ename] = earg
# Standardize line endings: # Standardize line endings:
text = re.sub("\r\n|\r", "\n", text) text = text.replace("\r\n", "\n")
text = text.replace("\r", "\n")
# Make sure $text ends with a couple of newlines: # Make sure $text ends with a couple of newlines:
text += "\n\n" text += "\n\n"
@ -308,13 +310,16 @@ class Markdown(object):
text = self.preprocess(text) text = self.preprocess(text)
if "fenced-code-blocks" in self.extras and not self.safe_mode:
text = self._do_fenced_code_blocks(text)
if self.safe_mode: if self.safe_mode:
text = self._hash_html_spans(text) text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries # Turn block-level HTML blocks into hash entries
text = self._hash_html_blocks(text, raw=True) text = self._hash_html_blocks(text, raw=True)
if "fenced-code-blocks" in self.extras: if "fenced-code-blocks" in self.extras and self.safe_mode:
text = self._do_fenced_code_blocks(text) text = self._do_fenced_code_blocks(text)
# Strip link definitions, store in hashes. # Strip link definitions, store in hashes.
@ -340,6 +345,9 @@ class Markdown(object):
if "nofollow" in self.extras: if "nofollow" in self.extras:
text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text) text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)
if "target-blank-links" in self.extras:
text = self._a_blank.sub(r'<\1 target="_blank"\2', text)
text += "\n" text += "\n"
rv = UnicodeWithAttrs(text) rv = UnicodeWithAttrs(text)
@ -363,18 +371,29 @@ class Markdown(object):
""" """
return text return text
# Is metadata if the content starts with '---'-fenced `key: value` # Is metadata if the content starts with optional '---'-fenced `key: value`
# pairs. E.g. (indented for presentation): # pairs. E.g. (indented for presentation):
# --- # ---
# foo: bar # foo: bar
# another-var: blah blah # another-var: blah blah
# --- # ---
_metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""") # # header
# or:
# foo: bar
# another-var: blah blah
#
# # header
_metadata_pat = re.compile(r"""
^
(?:---[\ \t]*\n)? # optional "---"
((?:[ \t]*[^ \t:]+[\ \t]*:[^\n]*\n)+) # "key: value" pairs
(?:---[ \t]*)? # optional "---"
\n""",
re.VERBOSE
)
def _extract_metadata(self, text): def _extract_metadata(self, text):
# fast test
if not text.startswith("---"):
return text
match = self._metadata_pat.match(text) match = self._metadata_pat.match(text)
if not match: if not match:
return text return text
@ -387,7 +406,6 @@ class Markdown(object):
return tail return tail
_emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE) _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
# This regular expression is intended to match blocks like this: # This regular expression is intended to match blocks like this:
# PREFIX Local Variables: SUFFIX # PREFIX Local Variables: SUFFIX
@ -505,14 +523,19 @@ class Markdown(object):
return emacs_vars return emacs_vars
# Cribbed from a post by Bart Lateur: def _detab_line(self, line):
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154> r"""Recursively convert tabs to spaces in a single line.
_detab_re = re.compile(r'(.*?)\t', re.M)
def _detab_sub(self, match): Called from _detab()."""
g1 = match.group(1) if '\t' not in line:
return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width)) return line
chunk1, chunk2 = line.split('\t', 1)
chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
output = chunk1 + chunk2
return self._detab_line(output)
def _detab(self, text): def _detab(self, text):
r"""Remove (leading?) tabs from a file. r"""Iterate text line by line and convert tabs to spaces.
>>> m = Markdown() >>> m = Markdown()
>>> m._detab("\tfoo") >>> m._detab("\tfoo")
@ -528,7 +551,10 @@ class Markdown(object):
""" """
if '\t' not in text: if '\t' not in text:
return text return text
return self._detab_re.subn(self._detab_sub, text)[0] output = []
for line in text.splitlines():
output.append(self._detab_line(line))
return '\n'.join(output)
# I broke out the html5 tags here and add them to _block_tags_a and # I broke out the html5 tags here and add them to _block_tags_a and
# _block_tags_b. This way html5 tags are easy to keep track of. # _block_tags_b. This way html5 tags are easy to keep track of.
@ -776,12 +802,7 @@ class Markdown(object):
re.X | re.M) re.X | re.M)
return footnote_def_re.sub(self._extract_footnote_def_sub, text) return footnote_def_re.sub(self._extract_footnote_def_sub, text)
_hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
_hr_data = [
('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
]
def _run_block_gamut(self, text): def _run_block_gamut(self, text):
# These are all the transformations that form block-level # These are all the transformations that form block-level
@ -798,13 +819,7 @@ class Markdown(object):
# Markdown.pl 1.0.1's hr regexes limit the number of spaces between the # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
# hr chars to one or two. We'll reproduce that limit here. # hr chars to one or two. We'll reproduce that limit here.
hr = "\n<hr"+self.empty_element_suffix+"\n" hr = "\n<hr"+self.empty_element_suffix+"\n"
for ch, regex in self._hr_data: text = re.sub(self._hr_re, hr, text)
if ch in text:
for m in reversed(list(regex.finditer(text))):
tail = m.group(1).rstrip()
if not tail.strip(ch + ' ') and tail.count(" ") == 0:
start, end = m.span()
text = text[:start] + hr + text[end:]
text = self._do_lists(text) text = self._do_lists(text)
@ -812,6 +827,8 @@ class Markdown(object):
text = self._prepare_pyshell_blocks(text) text = self._prepare_pyshell_blocks(text)
if "wiki-tables" in self.extras: if "wiki-tables" in self.extras:
text = self._do_wiki_tables(text) text = self._do_wiki_tables(text)
if "tables" in self.extras:
text = self._do_tables(text)
text = self._do_code_blocks(text) text = self._do_code_blocks(text)
@ -852,6 +869,79 @@ class Markdown(object):
return _pyshell_block_re.sub(self._pyshell_block_sub, text) return _pyshell_block_re.sub(self._pyshell_block_sub, text)
def _table_sub(self, match):
trim_space_re = '^[ \t\n]+|[ \t\n]+$'
trim_bar_re = '^\||\|$'
head, underline, body = match.groups()
# Determine aligns for columns.
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)).split('|')]
align_from_col_idx = {}
for col_idx, col in enumerate(cols):
if col[0] == ':' and col[-1] == ':':
align_from_col_idx[col_idx] = ' align="center"'
elif col[0] == ':':
align_from_col_idx[col_idx] = ' align="left"'
elif col[-1] == ':':
align_from_col_idx[col_idx] = ' align="right"'
# thead
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <th%s>%s</th>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</thead>')
# tbody
hlines.append('<tbody>')
for line in body.strip('\n').split('\n'):
hlines.append('<tr>')
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <td%s>%s</td>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</tbody>')
hlines.append('</table>')
return '\n'.join(hlines) + '\n'
def _do_tables(self, text):
"""Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
"""
less_than_tab = self.tab_width - 1
table_re = re.compile(r'''
(?:(?<=\n\n)|\A\n?) # leading blank line
^[ ]{0,%d} # allowed whitespace
(.*[|].*) \n # $1: header row (at least one pipe)
^[ ]{0,%d} # allowed whitespace
( # $2: underline row
# underline row with leading bar
(?: \|\ *:?-+:?\ * )+ \|? \n
|
# or, underline row without leading bar
(?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
)
( # $3: data rows
(?:
^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
.*\|.* \n
)+
)
''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
return table_re.sub(self._table_sub, text)
def _wiki_table_sub(self, match): def _wiki_table_sub(self, match):
ttext = match.group(0).strip() ttext = match.group(0).strip()
# print 'wiki table: %r' % match.group(0) # print 'wiki table: %r' % match.group(0)
@ -861,7 +951,7 @@ class Markdown(object):
row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)] row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
rows.append(row) rows.append(row)
# pprint(rows) # pprint(rows)
hlines = ['<table>', '<tbody>'] hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
for row in rows: for row in rows:
hrow = ['<tr>'] hrow = ['<tr>']
for cell in row: for cell in row:
@ -907,12 +997,18 @@ class Markdown(object):
text = self._encode_amps_and_angles(text) text = self._encode_amps_and_angles(text)
if "strike" in self.extras:
text = self._do_strike(text)
text = self._do_italics_and_bold(text) text = self._do_italics_and_bold(text)
if "smarty-pants" in self.extras: if "smarty-pants" in self.extras:
text = self._do_smart_punctuation(text) text = self._do_smart_punctuation(text)
# Do hard breaks: # Do hard breaks:
if "break-on-newline" in self.extras:
text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
else:
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text) text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
return text return text
@ -1003,22 +1099,14 @@ class Markdown(object):
raise MarkdownError("invalid value for 'safe_mode': %r (must be " raise MarkdownError("invalid value for 'safe_mode': %r (must be "
"'escape' or 'replace')" % self.safe_mode) "'escape' or 'replace')" % self.safe_mode)
_tail_of_inline_link_re = re.compile(r''' _inline_link_title = re.compile(r'''
# Match tail of: [text](/url/) or [text](/url/ "title") ( # \1
\( # literal paren [ \t]+
[ \t]* (['"]) # quote char = \2
(?P<url> # \1
<.*?>
|
.*?
)
[ \t]*
( # \2
(['"]) # quote char = \3
(?P<title>.*?) (?P<title>.*?)
\3 # matching quote \2
)? # title is optional )? # title is optional
\) \)$
''', re.X | re.S) ''', re.X | re.S)
_tail_of_reference_link_re = re.compile(r''' _tail_of_reference_link_re = re.compile(r'''
# Match tail of: [text][id] # Match tail of: [text][id]
@ -1029,6 +1117,52 @@ class Markdown(object):
\] \]
''', re.X | re.S) ''', re.X | re.S)
# Matches a (possibly empty) run of whitespace. _find_non_whitespace()
# anchors it at a given position to skip ahead to the next real character.
_whitespace = re.compile(r'\s*')
# Captures what lies between a '<' and the last '>' after it (and consumes
# whatever follows). _extract_url_and_title() substitutes the capture to
# unwrap '<url>' into 'url'.
_strip_anglebrackets = re.compile(r'<(.*)>.*')
def _find_non_whitespace(self, text, start):
"""Returns the index of the first non-whitespace character in text
after (and including) start
"""
match = self._whitespace.match(text, start)
return match.end()
def _find_balanced(self, text, start, open_c, close_c):
"""Returns the index where the open_c and close_c characters balance
out - the same number of open_c and close_c are encountered - or the
end of string if it's reached before the balance point is found.
"""
i = start
l = len(text)
count = 1
while count > 0 and i < l:
if text[i] == open_c:
count += 1
elif text[i] == close_c:
count -= 1
i += 1
return i
def _extract_url_and_title(self, text, start):
"""Extracts the url and (optional) title from the tail of a link"""
# text[start] equals the opening parenthesis
idx = self._find_non_whitespace(text, start+1)
if idx == len(text):
return None, None, None
end_idx = idx
has_anglebrackets = text[idx] == "<"
if has_anglebrackets:
end_idx = self._find_balanced(text, end_idx+1, "<", ">")
end_idx = self._find_balanced(text, end_idx, "(", ")")
match = self._inline_link_title.search(text, idx, end_idx)
if not match:
return None, None, None
url, title = text[idx:match.start()], match.group("title")
if has_anglebrackets:
url = self._strip_anglebrackets.sub(r'\1', url)
return url, title, end_idx
def _do_links(self, text): def _do_links(self, text):
"""Turn Markdown link shortcuts into XHTML <a> and <img> tags. """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
@ -1111,8 +1245,8 @@ class Markdown(object):
# Inline anchor or img? # Inline anchor or img?
if text[p] == '(': # attempt at perf improvement if text[p] == '(': # attempt at perf improvement
match = self._tail_of_inline_link_re.match(text, p) url, title, url_end_idx = self._extract_url_and_title(text, p)
if match: if url is not None:
# Handle an inline anchor or img. # Handle an inline anchor or img.
is_img = start_idx > 0 and text[start_idx-1] == "!" is_img = start_idx > 0 and text[start_idx-1] == "!"
if is_img: if is_img:
@ -1123,9 +1257,6 @@ class Markdown(object):
start_idx -= 1 start_idx -= 1
is_img = 1 is_img = 1
url, title = match.group("url"), match.group("title")
if url and url[0] == '<':
url = url[1:-1] # '<url>' -> 'url'
# We've got to encode these to avoid conflicting # We've got to encode these to avoid conflicting
# with italics/bold. # with italics/bold.
url = url.replace('*', self._escape_table['*']) \ url = url.replace('*', self._escape_table['*']) \
@ -1138,20 +1269,17 @@ class Markdown(object):
else: else:
title_str = '' title_str = ''
if is_img: if is_img:
img_class_str = self._html_class_str_from_tag("img")
if is_inline_img: if is_inline_img:
result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \ img_class_str = ' class="inlineimage"'
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'), % (url.replace('"', '&quot;'),
_xml_escape_attr(link_text), _xml_escape_attr(link_text),
title_str, self.empty_element_suffix) title_str, img_class_str, self.empty_element_suffix)
else:
result = '<img src="%s" alt="%s"%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, self.empty_element_suffix)
if "smarty-pants" in self.extras: if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"']) result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result) curr_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():] text = text[:start_idx] + result + text[url_end_idx:]
elif start_idx >= anchor_allowed_pos: elif start_idx >= anchor_allowed_pos:
result_head = '<a href="%s"%s>' % (url, title_str) result_head = '<a href="%s"%s>' % (url, title_str)
result = '%s%s</a>' % (result_head, link_text) result = '%s%s</a>' % (result_head, link_text)
@ -1161,7 +1289,7 @@ class Markdown(object):
# anchor_allowed_pos on. # anchor_allowed_pos on.
curr_pos = start_idx + len(result_head) curr_pos = start_idx + len(result_head)
anchor_allowed_pos = start_idx + len(result) anchor_allowed_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():] text = text[:start_idx] + result + text[url_end_idx:]
else: else:
# Anchor not allowed here. # Anchor not allowed here.
curr_pos = start_idx + 1 curr_pos = start_idx + 1
@ -1186,7 +1314,6 @@ class Markdown(object):
.replace('_', self._escape_table['_']) .replace('_', self._escape_table['_'])
title = self.titles.get(link_id) title = self.titles.get(link_id)
if title: if title:
before = title
title = _xml_escape_attr(title) \ title = _xml_escape_attr(title) \
.replace('*', self._escape_table['*']) \ .replace('*', self._escape_table['*']) \
.replace('_', self._escape_table['_']) .replace('_', self._escape_table['_'])
@ -1194,10 +1321,11 @@ class Markdown(object):
else: else:
title_str = '' title_str = ''
if is_img: if is_img:
result = '<img src="%s" alt="%s"%s%s' \ img_class_str = self._html_class_str_from_tag("img")
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'), % (url.replace('"', '&quot;'),
link_text.replace('"', '&quot;'), link_text.replace('"', '&quot;'),
title_str, self.empty_element_suffix) title_str, img_class_str, self.empty_element_suffix)
if "smarty-pants" in self.extras: if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"']) result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result) curr_pos = start_idx + len(result)
@ -1258,44 +1386,42 @@ class Markdown(object):
self._toc = [] self._toc = []
self._toc.append((level, id, self._unescape_special_chars(name))) self._toc.append((level, id, self._unescape_special_chars(name)))
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M) _h_re_base = r'''
def _setext_h_sub(self, match): (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
n = {"=": 1, "-": 2}[match.group(2)[0]] |
demote_headers = self.extras.get("demote-headers") (^(\#{1,6}) # \1 = string of #'s
if demote_headers: [ \t]%s
n = min(n + demote_headers, 6)
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(1),
self.extras["header-ids"], n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(1))
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
_atx_h_re = re.compile(r'''
^(\#{1,6}) # \1 = string of #'s
[ \t]+
(.+?) # \2 = Header text (.+?) # \2 = Header text
[ \t]* [ \t]*
(?<!\\) # ensure not an escaped trailing '#' (?<!\\) # ensure not an escaped trailing '#'
\#* # optional closing #'s (not counted) \#* # optional closing #'s (not counted)
\n+ \n+
''', re.X | re.M) )
def _atx_h_sub(self, match): '''
n = len(match.group(1))
_h_re = re.compile(_h_re_base % '*', re.X | re.M)
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
def _h_sub(self, match):
if match.group(1) is not None:
# Setext header
n = {"=": 1, "-": 2}[match.group(3)[0]]
header_group = match.group(2)
else:
# atx header
n = len(match.group(5))
header_group = match.group(6)
demote_headers = self.extras.get("demote-headers") demote_headers = self.extras.get("demote-headers")
if demote_headers: if demote_headers:
n = min(n + demote_headers, 6) n = min(n + demote_headers, 6)
header_id_attr = "" header_id_attr = ""
if "header-ids" in self.extras: if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(2), header_id = self.header_id_from_text(header_group,
self.extras["header-ids"], n) self.extras["header-ids"], n)
if header_id: if header_id:
header_id_attr = ' id="%s"' % header_id header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(2)) html = self._run_span_gamut(header_group)
if "toc" in self.extras and header_id: if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html) self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n) return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
@ -1307,7 +1433,6 @@ class Markdown(object):
# #
# Header 2 # Header 2
# -------- # --------
text = self._setext_h_re.sub(self._setext_h_sub, text)
# atx-style headers: # atx-style headers:
# # Header 1 # # Header 1
@ -1315,10 +1440,10 @@ class Markdown(object):
# ## Header 2 with closing hashes ## # ## Header 2 with closing hashes ##
# ... # ...
# ###### Header 6 # ###### Header 6
text = self._atx_h_re.sub(self._atx_h_sub, text)
return text
if 'tag-friendly' in self.extras:
return self._h_re_tag_friendly.sub(self._h_sub, text)
return self._h_re.sub(self._h_sub, text)
_marker_ul_chars = '*+-' _marker_ul_chars = '*+-'
_marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
@ -1380,8 +1505,9 @@ class Markdown(object):
hits.sort() hits.sort()
match = hits[0][1] match = hits[0][1]
start, end = match.span() start, end = match.span()
text = text[:start] + self._list_sub(match) + text[end:] middle = self._list_sub(match)
pos = end text = text[:start] + middle + text[end:]
pos = start + len(middle) # start pos for next attempted match
return text return text
@ -1395,11 +1521,25 @@ class Markdown(object):
''' % (_marker_any, _marker_any), ''' % (_marker_any, _marker_any),
re.M | re.X | re.S) re.M | re.X | re.S)
# Matches a task list marker ("[ ]" or "[x]") followed by whitespace and
# the remainder of the list item. Applied to rendered list items when the
# "task_list" extra is enabled.
_task_list_item_re = re.compile(r'''
(\[[\ x]\])[ \t]+ # tasklist marker = \1
(.*) # list item text = \2
''', re.M | re.X | re.S)
# HTML template for a task list item: the first %s receives 'checked ' (or
# the empty string), the second the item text.
_task_list_warpper_str = r'<p><input type="checkbox" class="task-list-item-checkbox" %sdisabled>%s</p>'
def _task_list_item_sub(self, match):
marker = match.group(1)
item_text = match.group(2)
if marker == '[x]':
return self._task_list_warpper_str % ('checked ', item_text)
elif marker == '[ ]':
return self._task_list_warpper_str % ('', item_text)
_last_li_endswith_two_eols = False _last_li_endswith_two_eols = False
def _list_item_sub(self, match): def _list_item_sub(self, match):
item = match.group(4) item = match.group(4)
leading_line = match.group(1) leading_line = match.group(1)
leading_space = match.group(2)
if leading_line or "\n\n" in item or self._last_li_endswith_two_eols: if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
item = self._run_block_gamut(self._outdent(item)) item = self._run_block_gamut(self._outdent(item))
else: else:
@ -1409,6 +1549,10 @@ class Markdown(object):
item = item[:-1] item = item[:-1]
item = self._run_span_gamut(item) item = self._run_span_gamut(item)
self._last_li_endswith_two_eols = (len(match.group(5)) == 2) self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
if "task_list" in self.extras:
item = self._task_list_item_re.sub(self._task_list_item_sub, item)
return "<li>%s</li>\n" % item return "<li>%s</li>\n" % item
def _process_list_items(self, list_str): def _process_list_items(self, list_str):
@ -1497,8 +1641,20 @@ class Markdown(object):
formatter_opts = self.extras['code-color'] or {} formatter_opts = self.extras['code-color'] or {}
if lexer_name: if lexer_name:
def unhash_code(codeblock):
    """Restore hashed HTML spans in `codeblock` and undo entity escaping.

    Each key of self.html_spans found in the block is replaced by its
    stored (sanitized) text, then the "&lt;", "&gt;" and "&amp;" entities
    are turned back into literal characters.
    """
    for key, sanitized in self.html_spans.items():
        codeblock = codeblock.replace(key, sanitized)
    # "&amp;" must be decoded last: decoding it first would turn an escaped
    # "&amp;lt;" into "&lt;" and then wrongly into "<" (double unescape).
    replacements = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&amp;", "&"),
    )
    for old, new in replacements:
        codeblock = codeblock.replace(old, new)
    return codeblock
lexer = self._get_pygments_lexer(lexer_name) lexer = self._get_pygments_lexer(lexer_name)
if lexer: if lexer:
codeblock = unhash_code( codeblock )
colored = self._color_with_pygments(codeblock, lexer, colored = self._color_with_pygments(codeblock, lexer,
**formatter_opts) **formatter_opts)
return "\n\n%s\n\n" % colored return "\n\n%s\n\n" % colored
@ -1535,19 +1691,22 @@ class Markdown(object):
)+ )+
) )
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
# Lookahead to make sure this block isn't already in a code block.
# Needed when syntax highlighting is being used.
(?![^<]*\</code\>)
''' % (self.tab_width, self.tab_width), ''' % (self.tab_width, self.tab_width),
re.M | re.X) re.M | re.X)
return code_block_re.sub(self._code_block_sub, text) return code_block_re.sub(self._code_block_sub, text)
_fenced_code_block_re = re.compile(r''' _fenced_code_block_re = re.compile(r'''
(?:\n\n|\A\n?) (?:\n+|\A\n?)
^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang ^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
(.*?) # $2 = code block content (.*?) # $2 = code block content
^```[ \t]*\n # closing fence ^```[ \t]*\n # closing fence
''', re.M | re.X | re.S) ''', re.M | re.X | re.S)
def _fenced_code_block_sub(self, match): def _fenced_code_block_sub(self, match):
return self._code_block_sub(match, is_fenced_code_block=True); return self._code_block_sub(match, is_fenced_code_block=True)
def _do_fenced_code_blocks(self, text): def _do_fenced_code_blocks(self, text):
"""Process ```-fenced unindented code blocks ('fenced-code-blocks' extra).""" """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
@ -1620,12 +1779,17 @@ class Markdown(object):
self._escape_table[text] = hashed self._escape_table[text] = hashed
return hashed return hashed
_strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
def _do_strike(self, text):
text = self._strike_re.sub(r"<strike>\1</strike>", text)
return text
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
_em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
_code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S) _code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S)
_code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S) _code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
def _do_italics_and_bold(self, text): def _do_italics_and_bold(self, text):
# <strong> must go first: # <strong> must go first:
if "code-friendly" in self.extras: if "code-friendly" in self.extras:
@ -1686,37 +1850,52 @@ class Markdown(object):
text = text.replace(". . .", "&#8230;") text = text.replace(". . .", "&#8230;")
return text return text
_block_quote_re = re.compile(r''' _block_quote_base = r'''
( # Wrap whole match in \1 ( # Wrap whole match in \1
( (
^[ \t]*>[ \t]? # '>' at the start of a line ^[ \t]*>%s[ \t]? # '>' at the start of a line
.+\n # rest of the first line .+\n # rest of the first line
(.+\n)* # subsequent consecutive lines (.+\n)* # subsequent consecutive lines
\n* # blanks \n* # blanks
)+ )+
) )
''', re.M | re.X) '''
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M); _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
_block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
_bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
_bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
_html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S) _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
def _dedent_two_spaces_sub(self, match): def _dedent_two_spaces_sub(self, match):
return re.sub(r'(?m)^ ', '', match.group(1)) return re.sub(r'(?m)^ ', '', match.group(1))
def _block_quote_sub(self, match): def _block_quote_sub(self, match):
bq = match.group(1) bq = match.group(1)
bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines # trim one level of quoting
if is_spoiler:
bq = self._bq_one_level_re_spoiler.sub('', bq)
else:
bq = self._bq_one_level_re.sub('', bq)
# trim whitespace-only lines
bq = self._ws_only_line_re.sub('', bq)
bq = self._run_block_gamut(bq) # recurse bq = self._run_block_gamut(bq) # recurse
bq = re.sub('(?m)^', ' ', bq) bq = re.sub('(?m)^', ' ', bq)
# These leading spaces screw with <pre> content, so we need to fix that: # These leading spaces screw with <pre> content, so we need to fix that:
bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq) bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
return "<blockquote>\n%s\n</blockquote>\n\n" % bq if is_spoiler:
return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
else:
return '<blockquote>\n%s\n</blockquote>\n\n' % bq
def _do_block_quotes(self, text): def _do_block_quotes(self, text):
if '>' not in text: if '>' not in text:
return text return text
if 'spoiler' in self.extras:
return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
else:
return self._block_quote_re.sub(self._block_quote_sub, text) return self._block_quote_re.sub(self._block_quote_sub, text)
def _form_paragraphs(self, text): def _form_paragraphs(self, text):
@ -1774,7 +1953,7 @@ class Markdown(object):
'&#8617;</a>' % (id, i+1)) '&#8617;</a>' % (id, i+1))
if footer[-1].endswith("</p>"): if footer[-1].endswith("</p>"):
footer[-1] = footer[-1][:-len("</p>")] \ footer[-1] = footer[-1][:-len("</p>")] \
+ '&nbsp;' + backlink + "</p>" + '&#160;' + backlink + "</p>"
else: else:
footer.append("\n<p>%s</p>" % backlink) footer.append("\n<p>%s</p>" % backlink)
footer.append('</li>') footer.append('</li>')
@ -1979,6 +2158,7 @@ def _curry(*args, **kwargs):
return function(*args + rest, **combined) return function(*args + rest, **combined)
return result return result
# Recipe: regex_from_encoded_pattern (1.0) # Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s): def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo')) """'foo' -> re.compile(re.escape('foo'))
@ -2008,6 +2188,7 @@ def _regex_from_encoded_pattern(s):
else: # not an encoded regex else: # not an encoded regex
return re.compile(re.escape(s)) return re.compile(re.escape(s))
# Recipe: dedent (0.1.2) # Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False): def _dedentlines(lines, tabsize=8, skip_first_line=False):
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
@ -2025,7 +2206,6 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
if DEBUG: if DEBUG:
print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
% (tabsize, skip_first_line)) % (tabsize, skip_first_line))
indents = []
margin = None margin = None
for i, line in enumerate(lines): for i, line in enumerate(lines):
if i == 0 and skip_first_line: continue if i == 0 and skip_first_line: continue
@ -2079,6 +2259,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
lines[i] = lines[i][removed:] lines[i] = lines[i][removed:]
return lines return lines
def _dedent(text, tabsize=8, skip_first_line=False): def _dedent(text, tabsize=8, skip_first_line=False):
"""_dedent(text, tabsize=8, skip_first_line=False) -> dedented text """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
@ -2105,6 +2286,7 @@ class _memoized(object):
def __init__(self, func): def __init__(self, func):
self.func = func self.func = func
self.cache = {} self.cache = {}
def __call__(self, *args): def __call__(self, *args):
try: try:
return self.cache[args] return self.cache[args]
@ -2115,6 +2297,7 @@ class _memoized(object):
# uncachable -- for instance, passing a list as an argument. # uncachable -- for instance, passing a list as an argument.
# Better to not cache than to blow up entirely. # Better to not cache than to blow up entirely.
return self.func(*args) return self.func(*args)
def __repr__(self): def __repr__(self):
"""Return the function's docstring.""" """Return the function's docstring."""
return self.func.__doc__ return self.func.__doc__
@ -2141,6 +2324,7 @@ def _xml_oneliner_re_from_tab_width(tab_width):
""" % (tab_width - 1), re.X) """ % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width) _xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
def _hr_tag_re_from_tab_width(tab_width): def _hr_tag_re_from_tab_width(tab_width):
return re.compile(r""" return re.compile(r"""
(?: (?:
@ -2191,7 +2375,6 @@ def _xml_encode_email_char_at_random(ch):
return '&#%s;' % ord(ch) return '&#%s;' % ord(ch)
# ---- mainline # ---- mainline
class _NoReflowFormatter(optparse.IndentedHelpFormatter): class _NoReflowFormatter(optparse.IndentedHelpFormatter):
@ -2199,10 +2382,12 @@ class _NoReflowFormatter(optparse.IndentedHelpFormatter):
def format_description(self, description):
    """Return `description` unchanged, or "" when it is None or empty."""
    return description or ""
def _test():
    """Run the doctests found in this module via `doctest.testmod()`."""
    import doctest
    doctest.testmod()
def main(argv=None): def main(argv=None):
if argv is None: if argv is None:
argv = sys.argv argv = sys.argv
@ -2319,7 +2504,7 @@ def main(argv=None):
sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
if extras and "toc" in extras: if extras and "toc" in extras:
log.debug("toc_html: " + log.debug("toc_html: " +
html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
if opts.compare: if opts.compare:
test_dir = join(dirname(dirname(abspath(__file__))), "test") test_dir = join(dirname(dirname(abspath(__file__))), "test")
if exists(join(test_dir, "test_markdown2.py")): if exists(join(test_dir, "test_markdown2.py")):