Update Markdown parser from 2.1.1 to 2.3.2

Gregory Soutade 2020-03-20 16:55:45 +01:00
parent 4b642fa48a
commit 7cb4f1d3d7
1 changed file with 365 additions and 180 deletions


@ -53,8 +53,9 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
* header-ids: Adds "id" attributes to headers. The id value is a slug of
the header text.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
string to use for a "class" tag attribute. Currently only supports
"pre" and "code" tags. Add an issue if you require this for other tags.
string to use for a "class" tag attribute. Currently only supports "img",
"table", "pre" and "code" tags. Add an issue if you require this for other
tags.
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
have markdown processing be done on its contents. Similar to
<http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
@ -70,9 +71,14 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
* smarty-pants: Replaces ' and " with curly quotation marks or curly
apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* toc: The returned HTML string gets a new "toc_html" attribute which is
a Table of Contents for the document. (experimental)
* xml: Passes one-liner processing instructions and namespaced XML tags.
* tables: Tables using the same format as GFM
<https://help.github.com/articles/github-flavored-markdown#tables> and
PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* wiki-tables: Google Code Wiki-style tables. See
<http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
"""
@ -82,13 +88,11 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
# not yet sure if there are implications with this. Compare 'pydoc sre'
# and 'perldoc perlre'.
__version_info__ = (2, 1, 1)
__version_info__ = (2, 3, 2)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"
import os
import sys
from pprint import pprint
import re
import logging
try:
@ -100,15 +104,9 @@ from random import random, randint
import codecs
#---- Python version compat
# ---- Python version compat
try:
from urllib.parse import quote # python3
except ImportError:
from urllib import quote # python2
if sys.version_info[:2] < (2,4):
from sets import Set as set
if sys.version_info[:2] < (2, 4):
def reversed(sequence):
for i in sequence[::-1]:
yield i
@ -127,8 +125,7 @@ elif sys.version_info[0] >= 3:
base_string_type = str
#---- globals
# ---- globals
DEBUG = False
log = logging.getLogger("markdown")
@ -145,15 +142,12 @@ g_escape_table = dict([(ch, _hash_text(ch))
for ch in '\\`*_{}[]()>#+-.!'])
#---- exceptions
# ---- exceptions
class MarkdownError(Exception):
pass
#---- public api
# ---- public api
def markdown_path(path, encoding="utf-8",
html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
@ -167,6 +161,7 @@ def markdown_path(path, encoding="utf-8",
link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text)
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
safe_mode=None, extras=None, link_patterns=None,
use_file_vars=False):
@ -175,6 +170,7 @@ def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text)
class Markdown(object):
# The dict of "extras" to enable in processing -- a mapping of
# extra name to argument for the extra. Most extras do not have an
@ -222,7 +218,7 @@ class Markdown(object):
extras = dict([(e, None) for e in extras])
self.extras.update(extras)
assert isinstance(self.extras, dict)
if "toc" in self.extras and not "header-ids" in self.extras:
if "toc" in self.extras and "header-ids" not in self.extras:
self.extras["header-ids"] = None # "toc" implies "header-ids"
self._instance_extras = self.extras.copy()
@ -254,6 +250,11 @@ class Markdown(object):
# should only be used in <a> tags with an "href" attribute.
_a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
# Opens the linked document in a new window or tab
# should only be used in <a> tags with a "target" attribute
# (same regex as _a_nofollow).
_a_blank = _a_nofollow
def convert(self, text):
"""Convert the given text."""
# Main function. The order in which other subs are called here is
@ -268,7 +269,7 @@ class Markdown(object):
self.reset()
if not isinstance(text, unicode):
#TODO: perhaps shouldn't presume UTF-8 for string input?
# TODO: perhaps shouldn't presume UTF-8 for string input?
text = unicode(text, 'utf-8')
if self.use_file_vars:
@ -288,7 +289,8 @@ class Markdown(object):
self.extras[ename] = earg
# Standardize line endings:
text = re.sub("\r\n|\r", "\n", text)
text = text.replace("\r\n", "\n")
text = text.replace("\r", "\n")
# Make sure $text ends with a couple of newlines:
text += "\n\n"
@ -308,13 +310,16 @@ class Markdown(object):
text = self.preprocess(text)
if "fenced-code-blocks" in self.extras and not self.safe_mode:
text = self._do_fenced_code_blocks(text)
if self.safe_mode:
text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries
text = self._hash_html_blocks(text, raw=True)
if "fenced-code-blocks" in self.extras:
if "fenced-code-blocks" in self.extras and self.safe_mode:
text = self._do_fenced_code_blocks(text)
# Strip link definitions, store in hashes.
@ -340,6 +345,9 @@ class Markdown(object):
if "nofollow" in self.extras:
text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)
if "target-blank-links" in self.extras:
text = self._a_blank.sub(r'<\1 target="_blank"\2', text)
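A short sketch of the two link-decorating extras applied just above (extra names taken from the checks; exact attribute order is whatever the two substitutions produce):

import markdown2

out = markdown2.markdown("[site](http://example.com/)",
                         extras=["nofollow", "target-blank-links"])
# the anchor ends up with both rel="nofollow" and target="_blank", roughly
# <p><a target="_blank" rel="nofollow" href="http://example.com/">site</a></p>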
text += "\n"
rv = UnicodeWithAttrs(text)
@ -363,18 +371,29 @@ class Markdown(object):
"""
return text
# Is metadata if the content starts with '---'-fenced `key: value`
# Is metadata if the content starts with optional '---'-fenced `key: value`
# pairs. E.g. (indented for presentation):
# ---
# foo: bar
# another-var: blah blah
# ---
_metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
# # header
# or:
# foo: bar
# another-var: blah blah
#
# # header
_metadata_pat = re.compile(r"""
^
(?:---[\ \t]*\n)? # optional "---"
((?:[ \t]*[^ \t:]+[\ \t]*:[^\n]*\n)+) # "key: value" pairs
(?:---[ \t]*)? # optional "---"
\n""",
re.VERBOSE
)
def _extract_metadata(self, text):
# fast test
if not text.startswith("---"):
return text
match = self._metadata_pat.match(text)
if not match:
return text
@ -387,7 +406,6 @@ class Markdown(object):
return tail
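A sketch of the front-matter handling above using the fenced form; the "metadata" extra name and the `.metadata` attribute on the result are assumptions, neither is visible in this hunk:

import markdown2

doc = "---\ntitle: Hello\nauthor: someone\n---\n\n# header\n"
result = markdown2.markdown(doc, extras=["metadata"])
# result.metadata is assumed to be {'title': 'Hello', 'author': 'someone'};
# per the new pattern, the leading/trailing "---" fences are now optional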
_emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
# This regular expression is intended to match blocks like this:
# PREFIX Local Variables: SUFFIX
@ -448,7 +466,7 @@ class Markdown(object):
prefix = match.group("prefix")
suffix = match.group("suffix")
lines = match.group("content").splitlines(0)
#print "prefix=%r, suffix=%r, content=%r, lines: %s"\
# print "prefix=%r, suffix=%r, content=%r, lines: %s"\
# % (prefix, suffix, match.group("content"), lines)
# Validate the Local Variables block: proper prefix and suffix
@ -505,14 +523,19 @@ class Markdown(object):
return emacs_vars
# Cribbed from a post by Bart Lateur:
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
_detab_re = re.compile(r'(.*?)\t', re.M)
def _detab_sub(self, match):
g1 = match.group(1)
return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
def _detab_line(self, line):
r"""Recusively convert tabs to spaces in a single line.
Called from _detab()."""
if '\t' not in line:
return line
chunk1, chunk2 = line.split('\t', 1)
chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
output = chunk1 + chunk2
return self._detab_line(output)
def _detab(self, text):
r"""Remove (leading?) tabs from a file.
r"""Iterate text line by line and convert tabs to spaces.
>>> m = Markdown()
>>> m._detab("\tfoo")
@ -528,7 +551,10 @@ class Markdown(object):
"""
if '\t' not in text:
return text
return self._detab_re.subn(self._detab_sub, text)[0]
output = []
for line in text.splitlines():
output.append(self._detab_line(line))
return '\n'.join(output)
# I broke out the html5 tags here and add them to _block_tags_a and
# _block_tags_b. This way html5 tags are easy to keep track of.
@ -776,12 +802,7 @@ class Markdown(object):
re.X | re.M)
return footnote_def_re.sub(self._extract_footnote_def_sub, text)
_hr_data = [
('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
]
_hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
def _run_block_gamut(self, text):
# These are all the transformations that form block-level
@ -798,13 +819,7 @@ class Markdown(object):
# Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
# hr chars to one or two. We'll reproduce that limit here.
hr = "\n<hr"+self.empty_element_suffix+"\n"
for ch, regex in self._hr_data:
if ch in text:
for m in reversed(list(regex.finditer(text))):
tail = m.group(1).rstrip()
if not tail.strip(ch + ' ') and tail.count(" ") == 0:
start, end = m.span()
text = text[:start] + hr + text[end:]
text = re.sub(self._hr_re, hr, text)
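The per-character `_hr_data` loop is gone; a quick check of the consolidated rule (pattern copied verbatim from `_hr_re` above):

import re

hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
for line in ("---", "* * *", "___", "- - -"):
    assert hr_re.match(line)   # all accepted as horizontal rules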
text = self._do_lists(text)
@ -812,6 +827,8 @@ class Markdown(object):
text = self._prepare_pyshell_blocks(text)
if "wiki-tables" in self.extras:
text = self._do_wiki_tables(text)
if "tables" in self.extras:
text = self._do_tables(text)
text = self._do_code_blocks(text)
@ -852,16 +869,89 @@ class Markdown(object):
return _pyshell_block_re.sub(self._pyshell_block_sub, text)
def _table_sub(self, match):
trim_space_re = '^[ \t\n]+|[ \t\n]+$'
trim_bar_re = '^\||\|$'
head, underline, body = match.groups()
# Determine aligns for columns.
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)).split('|')]
align_from_col_idx = {}
for col_idx, col in enumerate(cols):
if col[0] == ':' and col[-1] == ':':
align_from_col_idx[col_idx] = ' align="center"'
elif col[0] == ':':
align_from_col_idx[col_idx] = ' align="left"'
elif col[-1] == ':':
align_from_col_idx[col_idx] = ' align="right"'
# thead
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <th%s>%s</th>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</thead>')
# tbody
hlines.append('<tbody>')
for line in body.strip('\n').split('\n'):
hlines.append('<tr>')
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <td%s>%s</td>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</tbody>')
hlines.append('</table>')
return '\n'.join(hlines) + '\n'
def _do_tables(self, text):
"""Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
"""
less_than_tab = self.tab_width - 1
table_re = re.compile(r'''
(?:(?<=\n\n)|\A\n?) # leading blank line
^[ ]{0,%d} # allowed whitespace
(.*[|].*) \n # $1: header row (at least one pipe)
^[ ]{0,%d} # allowed whitespace
( # $2: underline row
# underline row with leading bar
(?: \|\ *:?-+:?\ * )+ \|? \n
|
# or, underline row without leading bar
(?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
)
( # $3: data rows
(?:
^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
.*\|.* \n
)+
)
''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
return table_re.sub(self._table_sub, text)
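A sketch of the new 'tables' extra wired up here (GFM-style syntax per the docstring addition above):

import markdown2

table_md = (
    "| Name  | Score |\n"
    "| :---- | ----: |\n"
    "| Alice |    10 |\n"
    "| Bob   |     7 |\n")
print(markdown2.markdown(table_md, extras=["tables"]))
# emits <table>/<thead>/<tbody> markup; align="left"/"right" comes from
# the :--- / ---: markers in the underline row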
def _wiki_table_sub(self, match):
ttext = match.group(0).strip()
#print 'wiki table: %r' % match.group(0)
# print 'wiki table: %r' % match.group(0)
rows = []
for line in ttext.splitlines(0):
line = line.strip()[2:-2].strip()
row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
rows.append(row)
#pprint(rows)
hlines = ['<table>', '<tbody>']
# pprint(rows)
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
for row in rows:
hrow = ['<tr>']
for cell in row:
@ -907,12 +997,18 @@ class Markdown(object):
text = self._encode_amps_and_angles(text)
if "strike" in self.extras:
text = self._do_strike(text)
text = self._do_italics_and_bold(text)
if "smarty-pants" in self.extras:
text = self._do_smart_punctuation(text)
# Do hard breaks:
if "break-on-newline" in self.extras:
text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
else:
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
return text
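A sketch of the new 'break-on-newline' branch above:

import markdown2

out = markdown2.markdown("first line\nsecond line",
                         extras=["break-on-newline"])
# every bare newline becomes <br />; without the extra, a line break still
# requires two or more trailing spaces before the newline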
@ -1003,22 +1099,14 @@ class Markdown(object):
raise MarkdownError("invalid value for 'safe_mode': %r (must be "
"'escape' or 'replace')" % self.safe_mode)
_tail_of_inline_link_re = re.compile(r'''
# Match tail of: [text](/url/) or [text](/url/ "title")
\( # literal paren
[ \t]*
(?P<url> # \1
<.*?>
|
.*?
)
[ \t]*
( # \2
(['"]) # quote char = \3
_inline_link_title = re.compile(r'''
( # \1
[ \t]+
(['"]) # quote char = \2
(?P<title>.*?)
\3 # matching quote
\2
)? # title is optional
\)
\)$
''', re.X | re.S)
_tail_of_reference_link_re = re.compile(r'''
# Match tail of: [text][id]
@ -1029,6 +1117,52 @@ class Markdown(object):
\]
''', re.X | re.S)
_whitespace = re.compile(r'\s*')
_strip_anglebrackets = re.compile(r'<(.*)>.*')
def _find_non_whitespace(self, text, start):
"""Returns the index of the first non-whitespace character in text
after (and including) start
"""
match = self._whitespace.match(text, start)
return match.end()
def _find_balanced(self, text, start, open_c, close_c):
"""Returns the index where the open_c and close_c characters balance
out - the same number of open_c and close_c are encountered - or the
end of string if it's reached before the balance point is found.
"""
i = start
l = len(text)
count = 1
while count > 0 and i < l:
if text[i] == open_c:
count += 1
elif text[i] == close_c:
count -= 1
i += 1
return i
def _extract_url_and_title(self, text, start):
"""Extracts the url and (optional) title from the tail of a link"""
# text[start] equals the opening parenthesis
idx = self._find_non_whitespace(text, start+1)
if idx == len(text):
return None, None, None
end_idx = idx
has_anglebrackets = text[idx] == "<"
if has_anglebrackets:
end_idx = self._find_balanced(text, end_idx+1, "<", ">")
end_idx = self._find_balanced(text, end_idx, "(", ")")
match = self._inline_link_title.search(text, idx, end_idx)
if not match:
return None, None, None
url, title = text[idx:match.start()], match.group("title")
if has_anglebrackets:
url = self._strip_anglebrackets.sub(r'\1', url)
return url, title, end_idx
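The balance counting above is what lets inline links carry parentheses inside the URL, which the old single tail regex could not handle; sketch:

import markdown2

md = "[wiki](https://en.wikipedia.org/wiki/Markdown_(disambiguation))"
print(markdown2.markdown(md))
# the full URL, including the trailing "(disambiguation)", lands in href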
def _do_links(self, text):
"""Turn Markdown link shortcuts into XHTML <a> and <img> tags.
@ -1111,8 +1245,8 @@ class Markdown(object):
# Inline anchor or img?
if text[p] == '(': # attempt at perf improvement
match = self._tail_of_inline_link_re.match(text, p)
if match:
url, title, url_end_idx = self._extract_url_and_title(text, p)
if url is not None:
# Handle an inline anchor or img.
is_img = start_idx > 0 and text[start_idx-1] == "!"
if is_img:
@ -1123,9 +1257,6 @@ class Markdown(object):
start_idx -= 1
is_img = 1
url, title = match.group("url"), match.group("title")
if url and url[0] == '<':
url = url[1:-1] # '<url>' -> 'url'
# We've got to encode these to avoid conflicting
# with italics/bold.
url = url.replace('*', self._escape_table['*']) \
@ -1138,20 +1269,17 @@ class Markdown(object):
else:
title_str = ''
if is_img:
img_class_str = self._html_class_str_from_tag("img")
if is_inline_img:
result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \
img_class_str = ' class="inlineimage"'
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, self.empty_element_suffix)
else:
result = '<img src="%s" alt="%s"%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, self.empty_element_suffix)
title_str, img_class_str, self.empty_element_suffix)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
text = text[:start_idx] + result + text[url_end_idx:]
elif start_idx >= anchor_allowed_pos:
result_head = '<a href="%s"%s>' % (url, title_str)
result = '%s%s</a>' % (result_head, link_text)
@ -1161,7 +1289,7 @@ class Markdown(object):
# anchor_allowed_pos on.
curr_pos = start_idx + len(result_head)
anchor_allowed_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
text = text[:start_idx] + result + text[url_end_idx:]
else:
# Anchor not allowed here.
curr_pos = start_idx + 1
@ -1186,7 +1314,6 @@ class Markdown(object):
.replace('_', self._escape_table['_'])
title = self.titles.get(link_id)
if title:
before = title
title = _xml_escape_attr(title) \
.replace('*', self._escape_table['*']) \
.replace('_', self._escape_table['_'])
@ -1194,10 +1321,11 @@ class Markdown(object):
else:
title_str = ''
if is_img:
result = '<img src="%s" alt="%s"%s%s' \
img_class_str = self._html_class_str_from_tag("img")
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'),
link_text.replace('"', '&quot;'),
title_str, self.empty_element_suffix)
title_str, img_class_str, self.empty_element_suffix)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result)
@ -1258,44 +1386,42 @@ class Markdown(object):
self._toc = []
self._toc.append((level, id, self._unescape_special_chars(name)))
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
def _setext_h_sub(self, match):
n = {"=": 1, "-": 2}[match.group(2)[0]]
demote_headers = self.extras.get("demote-headers")
if demote_headers:
n = min(n + demote_headers, 6)
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(1),
self.extras["header-ids"], n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(1))
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
_atx_h_re = re.compile(r'''
^(\#{1,6}) # \1 = string of #'s
[ \t]+
_h_re_base = r'''
(^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
|
(^(\#{1,6}) # \1 = string of #'s
[ \t]%s
(.+?) # \2 = Header text
[ \t]*
(?<!\\) # ensure not an escaped trailing '#'
\#* # optional closing #'s (not counted)
\n+
''', re.X | re.M)
def _atx_h_sub(self, match):
n = len(match.group(1))
)
'''
_h_re = re.compile(_h_re_base % '*', re.X | re.M)
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
def _h_sub(self, match):
if match.group(1) is not None:
# Setext header
n = {"=": 1, "-": 2}[match.group(3)[0]]
header_group = match.group(2)
else:
# atx header
n = len(match.group(5))
header_group = match.group(6)
demote_headers = self.extras.get("demote-headers")
if demote_headers:
n = min(n + demote_headers, 6)
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(2),
header_id = self.header_id_from_text(header_group,
self.extras["header-ids"], n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(2))
html = self._run_span_gamut(header_group)
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
@ -1307,7 +1433,6 @@ class Markdown(object):
#
# Header 2
# --------
text = self._setext_h_re.sub(self._setext_h_sub, text)
# atx-style headers:
# # Header 1
@ -1315,10 +1440,10 @@ class Markdown(object):
# ## Header 2 with closing hashes ##
# ...
# ###### Header 6
text = self._atx_h_re.sub(self._atx_h_sub, text)
return text
if 'tag-friendly' in self.extras:
return self._h_re_tag_friendly.sub(self._h_sub, text)
return self._h_re.sub(self._h_sub, text)
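Setext and atx headers now share one regex; the 'tag-friendly' extra requires a space after the leading '#', so hashtags are left alone. Sketch:

import markdown2

print(markdown2.markdown("Title\n=====\n\n## Section ##\n"))
# <h1>Title</h1> ... <h2>Section</h2>

print(markdown2.markdown("#hashtag", extras=["tag-friendly"]))
# not turned into an <h1>, because there is no space after '#'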
_marker_ul_chars = '*+-'
_marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
@ -1380,8 +1505,9 @@ class Markdown(object):
hits.sort()
match = hits[0][1]
start, end = match.span()
text = text[:start] + self._list_sub(match) + text[end:]
pos = end
middle = self._list_sub(match)
text = text[:start] + middle + text[end:]
pos = start + len(middle) # start pos for next attempted match
return text
@ -1395,11 +1521,25 @@ class Markdown(object):
''' % (_marker_any, _marker_any),
re.M | re.X | re.S)
_task_list_item_re = re.compile(r'''
(\[[\ x]\])[ \t]+ # tasklist marker = \1
(.*) # list item text = \2
''', re.M | re.X | re.S)
_task_list_warpper_str = r'<p><input type="checkbox" class="task-list-item-checkbox" %sdisabled>%s</p>'
def _task_list_item_sub(self, match):
marker = match.group(1)
item_text = match.group(2)
if marker == '[x]':
return self._task_list_warpper_str % ('checked ', item_text)
elif marker == '[ ]':
return self._task_list_warpper_str % ('', item_text)
_last_li_endswith_two_eols = False
def _list_item_sub(self, match):
item = match.group(4)
leading_line = match.group(1)
leading_space = match.group(2)
if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
item = self._run_block_gamut(self._outdent(item))
else:
@ -1409,6 +1549,10 @@ class Markdown(object):
item = item[:-1]
item = self._run_span_gamut(item)
self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
if "task_list" in self.extras:
item = self._task_list_item_re.sub(self._task_list_item_sub, item)
return "<li>%s</li>\n" % item
def _process_list_items(self, list_str):
@ -1497,8 +1641,20 @@ class Markdown(object):
formatter_opts = self.extras['code-color'] or {}
if lexer_name:
def unhash_code(codeblock):
for key, sanitized in list(self.html_spans.items()):
codeblock = codeblock.replace(key, sanitized)
replacements = [
("&amp;", "&"),
("&lt;", "<"),
("&gt;", ">")
]
for old, new in replacements:
codeblock = codeblock.replace(old, new)
return codeblock
lexer = self._get_pygments_lexer(lexer_name)
if lexer:
codeblock = unhash_code( codeblock )
colored = self._color_with_pygments(codeblock, lexer,
**formatter_opts)
return "\n\n%s\n\n" % colored
@ -1535,19 +1691,22 @@ class Markdown(object):
)+
)
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
# Lookahead to make sure this block isn't already in a code block.
# Needed when syntax highlighting is being used.
(?![^<]*\</code\>)
''' % (self.tab_width, self.tab_width),
re.M | re.X)
return code_block_re.sub(self._code_block_sub, text)
_fenced_code_block_re = re.compile(r'''
(?:\n\n|\A\n?)
(?:\n+|\A\n?)
^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
(.*?) # $2 = code block content
^```[ \t]*\n # closing fence
''', re.M | re.X | re.S)
def _fenced_code_block_sub(self, match):
return self._code_block_sub(match, is_fenced_code_block=True);
return self._code_block_sub(match, is_fenced_code_block=True)
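For reference, a sketch of the 'fenced-code-blocks' extra these two pieces implement:

import markdown2

md = "```python\nprint('hi')\n```\n"
print(markdown2.markdown(md, extras=["fenced-code-blocks"]))
# yields a <pre><code> block; with Pygments available, the "python" tag
# picks a lexer so the block is syntax-coloured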
def _do_fenced_code_blocks(self, text):
"""Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
@ -1620,12 +1779,17 @@ class Markdown(object):
self._escape_table[text] = hashed
return hashed
_strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
def _do_strike(self, text):
text = self._strike_re.sub(r"<strike>\1</strike>", text)
return text
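The 'strike' extra hooked in above is a one-liner; sketch:

import markdown2

print(markdown2.markdown("~~gone~~ kept", extras=["strike"]))
# "~~gone~~" becomes <strike>gone</strike>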
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
_em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
_code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S)
_code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
def _do_italics_and_bold(self, text):
# <strong> must go first:
if "code-friendly" in self.extras:
@ -1686,37 +1850,52 @@ class Markdown(object):
text = text.replace(". . .", "&#8230;")
return text
_block_quote_re = re.compile(r'''
_block_quote_base = r'''
( # Wrap whole match in \1
(
^[ \t]*>[ \t]? # '>' at the start of a line
^[ \t]*>%s[ \t]? # '>' at the start of a line
.+\n # rest of the first line
(.+\n)* # subsequent consecutive lines
\n* # blanks
)+
)
''', re.M | re.X)
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
'''
_block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
_block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
_bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
_bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
_html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
def _dedent_two_spaces_sub(self, match):
return re.sub(r'(?m)^ ', '', match.group(1))
def _block_quote_sub(self, match):
bq = match.group(1)
bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
# trim one level of quoting
if is_spoiler:
bq = self._bq_one_level_re_spoiler.sub('', bq)
else:
bq = self._bq_one_level_re.sub('', bq)
# trim whitespace-only lines
bq = self._ws_only_line_re.sub('', bq)
bq = self._run_block_gamut(bq) # recurse
bq = re.sub('(?m)^', ' ', bq)
# These leading spaces screw with <pre> content, so we need to fix that:
bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
return "<blockquote>\n%s\n</blockquote>\n\n" % bq
if is_spoiler:
return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
else:
return '<blockquote>\n%s\n</blockquote>\n\n' % bq
def _do_block_quotes(self, text):
if '>' not in text:
return text
if 'spoiler' in self.extras:
return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
else:
return self._block_quote_re.sub(self._block_quote_sub, text)
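A sketch of the 'spoiler' extra wired up above (syntax per the Stack Exchange convention linked in the docstring):

import markdown2

md = ">! this text is hidden\n>! until the reader clicks\n"
print(markdown2.markdown(md, extras=["spoiler"]))
# a quote whose lines all start with ">!" renders as
# <blockquote class="spoiler">...</blockquote>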
def _form_paragraphs(self, text):
@ -1774,7 +1953,7 @@ class Markdown(object):
'&#8617;</a>' % (id, i+1))
if footer[-1].endswith("</p>"):
footer[-1] = footer[-1][:-len("</p>")] \
+ '&nbsp;' + backlink + "</p>"
+ '&#160;' + backlink + "</p>"
else:
footer.append("\n<p>%s</p>" % backlink)
footer.append('</li>')
@ -1910,7 +2089,7 @@ class MarkdownWithExtras(Markdown):
extras = ["footnotes", "code-color"]
#---- internal support functions
# ---- internal support functions
class UnicodeWithAttrs(unicode):
"""A subclass of unicode used for the return value of conversion to
@ -1979,6 +2158,7 @@ def _curry(*args, **kwargs):
return function(*args + rest, **combined)
return result
# Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
@ -2008,6 +2188,7 @@ def _regex_from_encoded_pattern(s):
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
@ -2025,7 +2206,6 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
if DEBUG:
print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
% (tabsize, skip_first_line))
indents = []
margin = None
for i, line in enumerate(lines):
if i == 0 and skip_first_line: continue
@ -2079,6 +2259,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
lines[i] = lines[i][removed:]
return lines
def _dedent(text, tabsize=8, skip_first_line=False):
"""_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
@ -2105,6 +2286,7 @@ class _memoized(object):
def __init__(self, func):
self.func = func
self.cache = {}
def __call__(self, *args):
try:
return self.cache[args]
@ -2115,6 +2297,7 @@ class _memoized(object):
# uncachable -- for instance, passing a list as an argument.
# Better to not cache than to blow up entirely.
return self.func(*args)
def __repr__(self):
"""Return the function's docstring."""
return self.func.__doc__
@ -2141,6 +2324,7 @@ def _xml_oneliner_re_from_tab_width(tab_width):
""" % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
def _hr_tag_re_from_tab_width(tab_width):
return re.compile(r"""
(?:
@ -2191,18 +2375,19 @@ def _xml_encode_email_char_at_random(ch):
return '&#%s;' % ord(ch)
#---- mainline
# ---- mainline
class _NoReflowFormatter(optparse.IndentedHelpFormatter):
"""An optparse formatter that does NOT reflow the description."""
def format_description(self, description):
return description or ""
def _test():
import doctest
doctest.testmod()
def main(argv=None):
if argv is None:
argv = sys.argv
@ -2319,7 +2504,7 @@ def main(argv=None):
sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
if extras and "toc" in extras:
log.debug("toc_html: " +
html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
if opts.compare:
test_dir = join(dirname(dirname(abspath(__file__))), "test")
if exists(join(test_dir, "test_markdown2.py")):
@ -2334,4 +2519,4 @@ def main(argv=None):
if __name__ == "__main__":
sys.exit( main(sys.argv) )
sys.exit(main(sys.argv))