Update Markdown parser from 2.1.1 to 2.3.2
This commit is contained in:
parent
4b642fa48a
commit
7cb4f1d3d7
|
@ -53,8 +53,9 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
|
|||
* header-ids: Adds "id" attributes to headers. The id value is a slug of
|
||||
the header text.
|
||||
* html-classes: Takes a dict mapping html tag names (lowercase) to a
|
||||
string to use for a "class" tag attribute. Currently only supports
|
||||
"pre" and "code" tags. Add an issue if you require this for other tags.
|
||||
string to use for a "class" tag attribute. Currently only supports "img",
|
||||
"table", "pre" and "code" tags. Add an issue if you require this for other
|
||||
tags.
|
||||
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
|
||||
have markdown processing be done on its contents. Similar to
|
||||
<http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
|
||||
|
@ -70,9 +71,14 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
|
|||
* smarty-pants: Replaces ' and " with curly quotation marks or curly
|
||||
apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
|
||||
and ellipses.
|
||||
* spoiler: A special kind of blockquote commonly hidden behind a
|
||||
click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
|
||||
* toc: The returned HTML string gets a new "toc_html" attribute which is
|
||||
a Table of Contents for the document. (experimental)
|
||||
* xml: Passes one-liner processing instructions and namespaced XML tags.
|
||||
* tables: Tables using the same format as GFM
|
||||
<https://help.github.com/articles/github-flavored-markdown#tables> and
|
||||
PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
|
||||
* wiki-tables: Google Code Wiki-style tables. See
|
||||
<http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
|
||||
"""
|
||||
|
@ -82,13 +88,11 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
|
|||
# not yet sure if there are implications with this. Compare 'pydoc sre'
|
||||
# and 'perldoc perlre'.
|
||||
|
||||
__version_info__ = (2, 1, 1)
|
||||
__version_info__ = (2, 3, 2)
|
||||
__version__ = '.'.join(map(str, __version_info__))
|
||||
__author__ = "Trent Mick"
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pprint import pprint
|
||||
import re
|
||||
import logging
|
||||
try:
|
||||
|
@ -100,15 +104,9 @@ from random import random, randint
|
|||
import codecs
|
||||
|
||||
|
||||
#---- Python version compat
|
||||
# ---- Python version compat
|
||||
|
||||
try:
|
||||
from urllib.parse import quote # python3
|
||||
except ImportError:
|
||||
from urllib import quote # python2
|
||||
|
||||
if sys.version_info[:2] < (2,4):
|
||||
from sets import Set as set
|
||||
if sys.version_info[:2] < (2, 4):
|
||||
def reversed(sequence):
|
||||
for i in sequence[::-1]:
|
||||
yield i
|
||||
|
@ -127,8 +125,7 @@ elif sys.version_info[0] >= 3:
|
|||
base_string_type = str
|
||||
|
||||
|
||||
|
||||
#---- globals
|
||||
# ---- globals
|
||||
|
||||
DEBUG = False
|
||||
log = logging.getLogger("markdown")
|
||||
|
@ -145,15 +142,12 @@ g_escape_table = dict([(ch, _hash_text(ch))
|
|||
for ch in '\\`*_{}[]()>#+-.!'])
|
||||
|
||||
|
||||
|
||||
#---- exceptions
|
||||
|
||||
# ---- exceptions
|
||||
class MarkdownError(Exception):
    """Error raised by this module, e.g. for an invalid 'safe_mode' value."""
|
||||
|
||||
|
||||
|
||||
#---- public api
|
||||
# ---- public api
|
||||
|
||||
def markdown_path(path, encoding="utf-8",
|
||||
html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
|
||||
|
@ -167,6 +161,7 @@ def markdown_path(path, encoding="utf-8",
|
|||
link_patterns=link_patterns,
|
||||
use_file_vars=use_file_vars).convert(text)
|
||||
|
||||
|
||||
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
|
||||
safe_mode=None, extras=None, link_patterns=None,
|
||||
use_file_vars=False):
|
||||
|
@ -175,6 +170,7 @@ def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
|
|||
link_patterns=link_patterns,
|
||||
use_file_vars=use_file_vars).convert(text)
|
||||
|
||||
|
||||
class Markdown(object):
|
||||
# The dict of "extras" to enable in processing -- a mapping of
|
||||
# extra name to argument for the extra. Most extras do not have an
|
||||
|
@ -222,7 +218,7 @@ class Markdown(object):
|
|||
extras = dict([(e, None) for e in extras])
|
||||
self.extras.update(extras)
|
||||
assert isinstance(self.extras, dict)
|
||||
if "toc" in self.extras and not "header-ids" in self.extras:
|
||||
if "toc" in self.extras and "header-ids" not in self.extras:
|
||||
self.extras["header-ids"] = None # "toc" implies "header-ids"
|
||||
self._instance_extras = self.extras.copy()
|
||||
|
||||
|
@ -254,6 +250,11 @@ class Markdown(object):
|
|||
# should only be used in <a> tags with an "href" attribute.
|
||||
_a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
|
||||
|
||||
# Opens the linked document in a new window or tab
|
||||
# (by adding a "target" attribute); should only be used in <a> tags with an "href" attribute,
|
||||
# same with _a_nofollow
|
||||
_a_blank = _a_nofollow
|
||||
|
||||
def convert(self, text):
|
||||
"""Convert the given text."""
|
||||
# Main function. The order in which other subs are called here is
|
||||
|
@ -268,7 +269,7 @@ class Markdown(object):
|
|||
self.reset()
|
||||
|
||||
if not isinstance(text, unicode):
|
||||
#TODO: perhaps shouldn't presume UTF-8 for string input?
|
||||
# TODO: perhaps shouldn't presume UTF-8 for string input?
|
||||
text = unicode(text, 'utf-8')
|
||||
|
||||
if self.use_file_vars:
|
||||
|
@ -288,7 +289,8 @@ class Markdown(object):
|
|||
self.extras[ename] = earg
|
||||
|
||||
# Standardize line endings:
|
||||
text = re.sub("\r\n|\r", "\n", text)
|
||||
text = text.replace("\r\n", "\n")
|
||||
text = text.replace("\r", "\n")
|
||||
|
||||
# Make sure $text ends with a couple of newlines:
|
||||
text += "\n\n"
|
||||
|
@ -308,13 +310,16 @@ class Markdown(object):
|
|||
|
||||
text = self.preprocess(text)
|
||||
|
||||
if "fenced-code-blocks" in self.extras and not self.safe_mode:
|
||||
text = self._do_fenced_code_blocks(text)
|
||||
|
||||
if self.safe_mode:
|
||||
text = self._hash_html_spans(text)
|
||||
|
||||
# Turn block-level HTML blocks into hash entries
|
||||
text = self._hash_html_blocks(text, raw=True)
|
||||
|
||||
if "fenced-code-blocks" in self.extras:
|
||||
if "fenced-code-blocks" in self.extras and self.safe_mode:
|
||||
text = self._do_fenced_code_blocks(text)
|
||||
|
||||
# Strip link definitions, store in hashes.
|
||||
|
@ -340,6 +345,9 @@ class Markdown(object):
|
|||
if "nofollow" in self.extras:
|
||||
text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)
|
||||
|
||||
if "target-blank-links" in self.extras:
|
||||
text = self._a_blank.sub(r'<\1 target="_blank"\2', text)
|
||||
|
||||
text += "\n"
|
||||
|
||||
rv = UnicodeWithAttrs(text)
|
||||
|
@ -363,18 +371,29 @@ class Markdown(object):
|
|||
"""
|
||||
return text
|
||||
|
||||
# Is metadata if the content starts with '---'-fenced `key: value`
|
||||
# Is metadata if the content starts with optional '---'-fenced `key: value`
|
||||
# pairs. E.g. (indented for presentation):
|
||||
# ---
|
||||
# foo: bar
|
||||
# another-var: blah blah
|
||||
# ---
|
||||
_metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
|
||||
# # header
|
||||
# or:
|
||||
# foo: bar
|
||||
# another-var: blah blah
|
||||
#
|
||||
# # header
|
||||
|
||||
_metadata_pat = re.compile(r"""
|
||||
^
|
||||
(?:---[\ \t]*\n)? # optional "---"
|
||||
((?:[ \t]*[^ \t:]+[\ \t]*:[^\n]*\n)+) # "key: value" pairs
|
||||
(?:---[ \t]*)? # optional "---"
|
||||
\n""",
|
||||
re.VERBOSE
|
||||
)
|
||||
|
||||
def _extract_metadata(self, text):
|
||||
# fast test
|
||||
if not text.startswith("---"):
|
||||
return text
|
||||
match = self._metadata_pat.match(text)
|
||||
if not match:
|
||||
return text
|
||||
|
@ -387,7 +406,6 @@ class Markdown(object):
|
|||
|
||||
return tail
|
||||
|
||||
|
||||
_emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
|
||||
# This regular expression is intended to match blocks like this:
|
||||
# PREFIX Local Variables: SUFFIX
|
||||
|
@ -448,7 +466,7 @@ class Markdown(object):
|
|||
prefix = match.group("prefix")
|
||||
suffix = match.group("suffix")
|
||||
lines = match.group("content").splitlines(0)
|
||||
#print "prefix=%r, suffix=%r, content=%r, lines: %s"\
|
||||
# print "prefix=%r, suffix=%r, content=%r, lines: %s"\
|
||||
# % (prefix, suffix, match.group("content"), lines)
|
||||
|
||||
# Validate the Local Variables block: proper prefix and suffix
|
||||
|
@ -505,14 +523,19 @@ class Markdown(object):
|
|||
|
||||
return emacs_vars
|
||||
|
||||
# Cribbed from a post by Bart Lateur:
|
||||
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
|
||||
_detab_re = re.compile(r'(.*?)\t', re.M)
|
||||
def _detab_sub(self, match):
|
||||
g1 = match.group(1)
|
||||
return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
|
||||
def _detab_line(self, line):
|
||||
r"""Recusively convert tabs to spaces in a single line.
|
||||
|
||||
Called from _detab()."""
|
||||
if '\t' not in line:
|
||||
return line
|
||||
chunk1, chunk2 = line.split('\t', 1)
|
||||
chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
|
||||
output = chunk1 + chunk2
|
||||
return self._detab_line(output)
|
||||
|
||||
def _detab(self, text):
|
||||
r"""Remove (leading?) tabs from a file.
|
||||
r"""Iterate text line by line and convert tabs to spaces.
|
||||
|
||||
>>> m = Markdown()
|
||||
>>> m._detab("\tfoo")
|
||||
|
@ -528,7 +551,10 @@ class Markdown(object):
|
|||
"""
|
||||
if '\t' not in text:
|
||||
return text
|
||||
return self._detab_re.subn(self._detab_sub, text)[0]
|
||||
output = []
|
||||
for line in text.splitlines():
|
||||
output.append(self._detab_line(line))
|
||||
return '\n'.join(output)
|
||||
|
||||
# I broke out the html5 tags here and add them to _block_tags_a and
|
||||
# _block_tags_b. This way html5 tags are easy to keep track of.
|
||||
|
@ -776,12 +802,7 @@ class Markdown(object):
|
|||
re.X | re.M)
|
||||
return footnote_def_re.sub(self._extract_footnote_def_sub, text)
|
||||
|
||||
|
||||
_hr_data = [
|
||||
('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
|
||||
('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
|
||||
('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
|
||||
]
|
||||
_hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
|
||||
|
||||
def _run_block_gamut(self, text):
|
||||
# These are all the transformations that form block-level
|
||||
|
@ -798,13 +819,7 @@ class Markdown(object):
|
|||
# Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
|
||||
# hr chars to one or two. We'll reproduce that limit here.
|
||||
hr = "\n<hr"+self.empty_element_suffix+"\n"
|
||||
for ch, regex in self._hr_data:
|
||||
if ch in text:
|
||||
for m in reversed(list(regex.finditer(text))):
|
||||
tail = m.group(1).rstrip()
|
||||
if not tail.strip(ch + ' ') and tail.count(" ") == 0:
|
||||
start, end = m.span()
|
||||
text = text[:start] + hr + text[end:]
|
||||
text = re.sub(self._hr_re, hr, text)
|
||||
|
||||
text = self._do_lists(text)
|
||||
|
||||
|
@ -812,6 +827,8 @@ class Markdown(object):
|
|||
text = self._prepare_pyshell_blocks(text)
|
||||
if "wiki-tables" in self.extras:
|
||||
text = self._do_wiki_tables(text)
|
||||
if "tables" in self.extras:
|
||||
text = self._do_tables(text)
|
||||
|
||||
text = self._do_code_blocks(text)
|
||||
|
||||
|
@ -852,16 +869,89 @@ class Markdown(object):
|
|||
|
||||
return _pyshell_block_re.sub(self._pyshell_block_sub, text)
|
||||
|
||||
def _table_sub(self, match):
|
||||
trim_space_re = '^[ \t\n]+|[ \t\n]+$'
|
||||
trim_bar_re = '^\||\|$'
|
||||
|
||||
head, underline, body = match.groups()
|
||||
|
||||
# Determine aligns for columns.
|
||||
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)).split('|')]
|
||||
align_from_col_idx = {}
|
||||
for col_idx, col in enumerate(cols):
|
||||
if col[0] == ':' and col[-1] == ':':
|
||||
align_from_col_idx[col_idx] = ' align="center"'
|
||||
elif col[0] == ':':
|
||||
align_from_col_idx[col_idx] = ' align="left"'
|
||||
elif col[-1] == ':':
|
||||
align_from_col_idx[col_idx] = ' align="right"'
|
||||
|
||||
# thead
|
||||
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
|
||||
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')]
|
||||
for col_idx, col in enumerate(cols):
|
||||
hlines.append(' <th%s>%s</th>' % (
|
||||
align_from_col_idx.get(col_idx, ''),
|
||||
self._run_span_gamut(col)
|
||||
))
|
||||
hlines.append('</tr>')
|
||||
hlines.append('</thead>')
|
||||
|
||||
# tbody
|
||||
hlines.append('<tbody>')
|
||||
for line in body.strip('\n').split('\n'):
|
||||
hlines.append('<tr>')
|
||||
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')]
|
||||
for col_idx, col in enumerate(cols):
|
||||
hlines.append(' <td%s>%s</td>' % (
|
||||
align_from_col_idx.get(col_idx, ''),
|
||||
self._run_span_gamut(col)
|
||||
))
|
||||
hlines.append('</tr>')
|
||||
hlines.append('</tbody>')
|
||||
hlines.append('</table>')
|
||||
|
||||
return '\n'.join(hlines) + '\n'
|
||||
|
||||
def _do_tables(self, text):
|
||||
"""Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
|
||||
https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
|
||||
"""
|
||||
less_than_tab = self.tab_width - 1
|
||||
table_re = re.compile(r'''
|
||||
(?:(?<=\n\n)|\A\n?) # leading blank line
|
||||
|
||||
^[ ]{0,%d} # allowed whitespace
|
||||
(.*[|].*) \n # $1: header row (at least one pipe)
|
||||
|
||||
^[ ]{0,%d} # allowed whitespace
|
||||
( # $2: underline row
|
||||
# underline row with leading bar
|
||||
(?: \|\ *:?-+:?\ * )+ \|? \n
|
||||
|
|
||||
# or, underline row without leading bar
|
||||
(?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
|
||||
)
|
||||
|
||||
( # $3: data rows
|
||||
(?:
|
||||
^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
|
||||
.*\|.* \n
|
||||
)+
|
||||
)
|
||||
''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
|
||||
return table_re.sub(self._table_sub, text)
|
||||
|
||||
def _wiki_table_sub(self, match):
|
||||
ttext = match.group(0).strip()
|
||||
#print 'wiki table: %r' % match.group(0)
|
||||
# print 'wiki table: %r' % match.group(0)
|
||||
rows = []
|
||||
for line in ttext.splitlines(0):
|
||||
line = line.strip()[2:-2].strip()
|
||||
row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
|
||||
rows.append(row)
|
||||
#pprint(rows)
|
||||
hlines = ['<table>', '<tbody>']
|
||||
# pprint(rows)
|
||||
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
|
||||
for row in rows:
|
||||
hrow = ['<tr>']
|
||||
for cell in row:
|
||||
|
@ -907,12 +997,18 @@ class Markdown(object):
|
|||
|
||||
text = self._encode_amps_and_angles(text)
|
||||
|
||||
if "strike" in self.extras:
|
||||
text = self._do_strike(text)
|
||||
|
||||
text = self._do_italics_and_bold(text)
|
||||
|
||||
if "smarty-pants" in self.extras:
|
||||
text = self._do_smart_punctuation(text)
|
||||
|
||||
# Do hard breaks:
|
||||
if "break-on-newline" in self.extras:
|
||||
text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
|
||||
else:
|
||||
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
|
||||
|
||||
return text
|
||||
|
@ -1003,22 +1099,14 @@ class Markdown(object):
|
|||
raise MarkdownError("invalid value for 'safe_mode': %r (must be "
|
||||
"'escape' or 'replace')" % self.safe_mode)
|
||||
|
||||
_tail_of_inline_link_re = re.compile(r'''
|
||||
# Match tail of: [text](/url/) or [text](/url/ "title")
|
||||
\( # literal paren
|
||||
[ \t]*
|
||||
(?P<url> # \1
|
||||
<.*?>
|
||||
|
|
||||
.*?
|
||||
)
|
||||
[ \t]*
|
||||
( # \2
|
||||
(['"]) # quote char = \3
|
||||
_inline_link_title = re.compile(r'''
|
||||
( # \1
|
||||
[ \t]+
|
||||
(['"]) # quote char = \2
|
||||
(?P<title>.*?)
|
||||
\3 # matching quote
|
||||
\2
|
||||
)? # title is optional
|
||||
\)
|
||||
\)$
|
||||
''', re.X | re.S)
|
||||
_tail_of_reference_link_re = re.compile(r'''
|
||||
# Match tail of: [text][id]
|
||||
|
@ -1029,6 +1117,52 @@ class Markdown(object):
|
|||
\]
|
||||
''', re.X | re.S)
|
||||
|
||||
_whitespace = re.compile(r'\s*')
|
||||
|
||||
_strip_anglebrackets = re.compile(r'<(.*)>.*')
|
||||
|
||||
def _find_non_whitespace(self, text, start):
|
||||
"""Returns the index of the first non-whitespace character in text
|
||||
after (and including) start
|
||||
"""
|
||||
match = self._whitespace.match(text, start)
|
||||
return match.end()
|
||||
|
||||
def _find_balanced(self, text, start, open_c, close_c):
|
||||
"""Returns the index where the open_c and close_c characters balance
|
||||
out - the same number of open_c and close_c are encountered - or the
|
||||
end of string if it's reached before the balance point is found.
|
||||
"""
|
||||
i = start
|
||||
l = len(text)
|
||||
count = 1
|
||||
while count > 0 and i < l:
|
||||
if text[i] == open_c:
|
||||
count += 1
|
||||
elif text[i] == close_c:
|
||||
count -= 1
|
||||
i += 1
|
||||
return i
|
||||
|
||||
def _extract_url_and_title(self, text, start):
    """Pull the url and (optional) title out of the tail of an inline link.

    ``text[start]`` is expected to be the opening parenthesis. Returns a
    ``(url, title, end_index)`` tuple, or ``(None, None, None)`` when no
    link tail is found.
    """
    not_found = (None, None, None)

    url_start = self._find_non_whitespace(text, start+1)
    if url_start == len(text):
        return not_found

    angle_wrapped = text[url_start] == "<"
    tail_end = url_start
    if angle_wrapped:
        # Skip over the <...> wrapper before looking for the closing paren.
        tail_end = self._find_balanced(text, tail_end+1, "<", ">")
    tail_end = self._find_balanced(text, tail_end, "(", ")")

    title_match = self._inline_link_title.search(text, url_start, tail_end)
    if not title_match:
        return not_found

    url = text[url_start:title_match.start()]
    if angle_wrapped:
        url = self._strip_anglebrackets.sub(r'\1', url)
    return url, title_match.group("title"), tail_end
|
||||
|
||||
def _do_links(self, text):
|
||||
"""Turn Markdown link shortcuts into XHTML <a> and <img> tags.
|
||||
|
||||
|
@ -1111,8 +1245,8 @@ class Markdown(object):
|
|||
|
||||
# Inline anchor or img?
|
||||
if text[p] == '(': # attempt at perf improvement
|
||||
match = self._tail_of_inline_link_re.match(text, p)
|
||||
if match:
|
||||
url, title, url_end_idx = self._extract_url_and_title(text, p)
|
||||
if url is not None:
|
||||
# Handle an inline anchor or img.
|
||||
is_img = start_idx > 0 and text[start_idx-1] == "!"
|
||||
if is_img:
|
||||
|
@ -1123,9 +1257,6 @@ class Markdown(object):
|
|||
start_idx -= 1
|
||||
is_img = 1
|
||||
|
||||
url, title = match.group("url"), match.group("title")
|
||||
if url and url[0] == '<':
|
||||
url = url[1:-1] # '<url>' -> 'url'
|
||||
# We've got to encode these to avoid conflicting
|
||||
# with italics/bold.
|
||||
url = url.replace('*', self._escape_table['*']) \
|
||||
|
@ -1138,20 +1269,17 @@ class Markdown(object):
|
|||
else:
|
||||
title_str = ''
|
||||
if is_img:
|
||||
img_class_str = self._html_class_str_from_tag("img")
|
||||
if is_inline_img:
|
||||
result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \
|
||||
img_class_str = ' class="inlineimage"'
|
||||
result = '<img src="%s" alt="%s"%s%s%s' \
|
||||
% (url.replace('"', '"'),
|
||||
_xml_escape_attr(link_text),
|
||||
title_str, self.empty_element_suffix)
|
||||
else:
|
||||
result = '<img src="%s" alt="%s"%s%s' \
|
||||
% (url.replace('"', '"'),
|
||||
_xml_escape_attr(link_text),
|
||||
title_str, self.empty_element_suffix)
|
||||
title_str, img_class_str, self.empty_element_suffix)
|
||||
if "smarty-pants" in self.extras:
|
||||
result = result.replace('"', self._escape_table['"'])
|
||||
curr_pos = start_idx + len(result)
|
||||
text = text[:start_idx] + result + text[match.end():]
|
||||
text = text[:start_idx] + result + text[url_end_idx:]
|
||||
elif start_idx >= anchor_allowed_pos:
|
||||
result_head = '<a href="%s"%s>' % (url, title_str)
|
||||
result = '%s%s</a>' % (result_head, link_text)
|
||||
|
@ -1161,7 +1289,7 @@ class Markdown(object):
|
|||
# anchor_allowed_pos on.
|
||||
curr_pos = start_idx + len(result_head)
|
||||
anchor_allowed_pos = start_idx + len(result)
|
||||
text = text[:start_idx] + result + text[match.end():]
|
||||
text = text[:start_idx] + result + text[url_end_idx:]
|
||||
else:
|
||||
# Anchor not allowed here.
|
||||
curr_pos = start_idx + 1
|
||||
|
@ -1186,7 +1314,6 @@ class Markdown(object):
|
|||
.replace('_', self._escape_table['_'])
|
||||
title = self.titles.get(link_id)
|
||||
if title:
|
||||
before = title
|
||||
title = _xml_escape_attr(title) \
|
||||
.replace('*', self._escape_table['*']) \
|
||||
.replace('_', self._escape_table['_'])
|
||||
|
@ -1194,10 +1321,11 @@ class Markdown(object):
|
|||
else:
|
||||
title_str = ''
|
||||
if is_img:
|
||||
result = '<img src="%s" alt="%s"%s%s' \
|
||||
img_class_str = self._html_class_str_from_tag("img")
|
||||
result = '<img src="%s" alt="%s"%s%s%s' \
|
||||
% (url.replace('"', '"'),
|
||||
link_text.replace('"', '"'),
|
||||
title_str, self.empty_element_suffix)
|
||||
title_str, img_class_str, self.empty_element_suffix)
|
||||
if "smarty-pants" in self.extras:
|
||||
result = result.replace('"', self._escape_table['"'])
|
||||
curr_pos = start_idx + len(result)
|
||||
|
@ -1258,44 +1386,42 @@ class Markdown(object):
|
|||
self._toc = []
|
||||
self._toc.append((level, id, self._unescape_special_chars(name)))
|
||||
|
||||
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
|
||||
def _setext_h_sub(self, match):
|
||||
n = {"=": 1, "-": 2}[match.group(2)[0]]
|
||||
demote_headers = self.extras.get("demote-headers")
|
||||
if demote_headers:
|
||||
n = min(n + demote_headers, 6)
|
||||
header_id_attr = ""
|
||||
if "header-ids" in self.extras:
|
||||
header_id = self.header_id_from_text(match.group(1),
|
||||
self.extras["header-ids"], n)
|
||||
if header_id:
|
||||
header_id_attr = ' id="%s"' % header_id
|
||||
html = self._run_span_gamut(match.group(1))
|
||||
if "toc" in self.extras and header_id:
|
||||
self._toc_add_entry(n, header_id, html)
|
||||
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
|
||||
|
||||
_atx_h_re = re.compile(r'''
|
||||
^(\#{1,6}) # \1 = string of #'s
|
||||
[ \t]+
|
||||
_h_re_base = r'''
|
||||
(^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
|
||||
|
|
||||
(^(\#{1,6}) # \1 = string of #'s
|
||||
[ \t]%s
|
||||
(.+?) # \2 = Header text
|
||||
[ \t]*
|
||||
(?<!\\) # ensure not an escaped trailing '#'
|
||||
\#* # optional closing #'s (not counted)
|
||||
\n+
|
||||
''', re.X | re.M)
|
||||
def _atx_h_sub(self, match):
|
||||
n = len(match.group(1))
|
||||
)
|
||||
'''
|
||||
|
||||
_h_re = re.compile(_h_re_base % '*', re.X | re.M)
|
||||
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
|
||||
|
||||
def _h_sub(self, match):
|
||||
if match.group(1) is not None:
|
||||
# Setext header
|
||||
n = {"=": 1, "-": 2}[match.group(3)[0]]
|
||||
header_group = match.group(2)
|
||||
else:
|
||||
# atx header
|
||||
n = len(match.group(5))
|
||||
header_group = match.group(6)
|
||||
|
||||
demote_headers = self.extras.get("demote-headers")
|
||||
if demote_headers:
|
||||
n = min(n + demote_headers, 6)
|
||||
header_id_attr = ""
|
||||
if "header-ids" in self.extras:
|
||||
header_id = self.header_id_from_text(match.group(2),
|
||||
header_id = self.header_id_from_text(header_group,
|
||||
self.extras["header-ids"], n)
|
||||
if header_id:
|
||||
header_id_attr = ' id="%s"' % header_id
|
||||
html = self._run_span_gamut(match.group(2))
|
||||
html = self._run_span_gamut(header_group)
|
||||
if "toc" in self.extras and header_id:
|
||||
self._toc_add_entry(n, header_id, html)
|
||||
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
|
||||
|
@ -1307,7 +1433,6 @@ class Markdown(object):
|
|||
#
|
||||
# Header 2
|
||||
# --------
|
||||
text = self._setext_h_re.sub(self._setext_h_sub, text)
|
||||
|
||||
# atx-style headers:
|
||||
# # Header 1
|
||||
|
@ -1315,10 +1440,10 @@ class Markdown(object):
|
|||
# ## Header 2 with closing hashes ##
|
||||
# ...
|
||||
# ###### Header 6
|
||||
text = self._atx_h_re.sub(self._atx_h_sub, text)
|
||||
|
||||
return text
|
||||
|
||||
if 'tag-friendly' in self.extras:
|
||||
return self._h_re_tag_friendly.sub(self._h_sub, text)
|
||||
return self._h_re.sub(self._h_sub, text)
|
||||
|
||||
_marker_ul_chars = '*+-'
|
||||
_marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
|
||||
|
@ -1380,8 +1505,9 @@ class Markdown(object):
|
|||
hits.sort()
|
||||
match = hits[0][1]
|
||||
start, end = match.span()
|
||||
text = text[:start] + self._list_sub(match) + text[end:]
|
||||
pos = end
|
||||
middle = self._list_sub(match)
|
||||
text = text[:start] + middle + text[end:]
|
||||
pos = start + len(middle) # start pos for next attempted match
|
||||
|
||||
return text
|
||||
|
||||
|
@ -1395,11 +1521,25 @@ class Markdown(object):
|
|||
''' % (_marker_any, _marker_any),
|
||||
re.M | re.X | re.S)
|
||||
|
||||
_task_list_item_re = re.compile(r'''
|
||||
(\[[\ x]\])[ \t]+ # tasklist marker = \1
|
||||
(.*) # list item text = \2
|
||||
''', re.M | re.X | re.S)
|
||||
|
||||
_task_list_warpper_str = r'<p><input type="checkbox" class="task-list-item-checkbox" %sdisabled>%s</p>'
|
||||
|
||||
def _task_list_item_sub(self, match):
|
||||
marker = match.group(1)
|
||||
item_text = match.group(2)
|
||||
if marker == '[x]':
|
||||
return self._task_list_warpper_str % ('checked ', item_text)
|
||||
elif marker == '[ ]':
|
||||
return self._task_list_warpper_str % ('', item_text)
|
||||
|
||||
_last_li_endswith_two_eols = False
|
||||
def _list_item_sub(self, match):
|
||||
item = match.group(4)
|
||||
leading_line = match.group(1)
|
||||
leading_space = match.group(2)
|
||||
if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
|
||||
item = self._run_block_gamut(self._outdent(item))
|
||||
else:
|
||||
|
@ -1409,6 +1549,10 @@ class Markdown(object):
|
|||
item = item[:-1]
|
||||
item = self._run_span_gamut(item)
|
||||
self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
|
||||
|
||||
if "task_list" in self.extras:
|
||||
item = self._task_list_item_re.sub(self._task_list_item_sub, item)
|
||||
|
||||
return "<li>%s</li>\n" % item
|
||||
|
||||
def _process_list_items(self, list_str):
|
||||
|
@ -1497,8 +1641,20 @@ class Markdown(object):
|
|||
formatter_opts = self.extras['code-color'] or {}
|
||||
|
||||
if lexer_name:
|
||||
def unhash_code(codeblock):
|
||||
for key, sanitized in list(self.html_spans.items()):
|
||||
codeblock = codeblock.replace(key, sanitized)
|
||||
replacements = [
|
||||
("&", "&"),
|
||||
("<", "<"),
|
||||
(">", ">")
|
||||
]
|
||||
for old, new in replacements:
|
||||
codeblock = codeblock.replace(old, new)
|
||||
return codeblock
|
||||
lexer = self._get_pygments_lexer(lexer_name)
|
||||
if lexer:
|
||||
codeblock = unhash_code( codeblock )
|
||||
colored = self._color_with_pygments(codeblock, lexer,
|
||||
**formatter_opts)
|
||||
return "\n\n%s\n\n" % colored
|
||||
|
@ -1535,19 +1691,22 @@ class Markdown(object):
|
|||
)+
|
||||
)
|
||||
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
|
||||
# Lookahead to make sure this block isn't already in a code block.
|
||||
# Needed when syntax highlighting is being used.
|
||||
(?![^<]*\</code\>)
|
||||
''' % (self.tab_width, self.tab_width),
|
||||
re.M | re.X)
|
||||
return code_block_re.sub(self._code_block_sub, text)
|
||||
|
||||
_fenced_code_block_re = re.compile(r'''
|
||||
(?:\n\n|\A\n?)
|
||||
(?:\n+|\A\n?)
|
||||
^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
|
||||
(.*?) # $2 = code block content
|
||||
^```[ \t]*\n # closing fence
|
||||
''', re.M | re.X | re.S)
|
||||
|
||||
def _fenced_code_block_sub(self, match):
|
||||
return self._code_block_sub(match, is_fenced_code_block=True);
|
||||
return self._code_block_sub(match, is_fenced_code_block=True)
|
||||
|
||||
def _do_fenced_code_blocks(self, text):
|
||||
"""Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
|
||||
|
@ -1620,12 +1779,17 @@ class Markdown(object):
|
|||
self._escape_table[text] = hashed
|
||||
return hashed
|
||||
|
||||
_strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
|
||||
def _do_strike(self, text):
|
||||
text = self._strike_re.sub(r"<strike>\1</strike>", text)
|
||||
return text
|
||||
|
||||
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
|
||||
_em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
|
||||
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
|
||||
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
|
||||
_code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S)
|
||||
_code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S)
|
||||
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
|
||||
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
|
||||
def _do_italics_and_bold(self, text):
|
||||
# <strong> must go first:
|
||||
if "code-friendly" in self.extras:
|
||||
|
@ -1686,37 +1850,52 @@ class Markdown(object):
|
|||
text = text.replace(". . .", "…")
|
||||
return text
|
||||
|
||||
_block_quote_re = re.compile(r'''
|
||||
_block_quote_base = r'''
|
||||
( # Wrap whole match in \1
|
||||
(
|
||||
^[ \t]*>[ \t]? # '>' at the start of a line
|
||||
^[ \t]*>%s[ \t]? # '>' at the start of a line
|
||||
.+\n # rest of the first line
|
||||
(.+\n)* # subsequent consecutive lines
|
||||
\n* # blanks
|
||||
)+
|
||||
)
|
||||
''', re.M | re.X)
|
||||
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
|
||||
|
||||
'''
|
||||
_block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
|
||||
_block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
|
||||
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
|
||||
_bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
|
||||
_bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
|
||||
_html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
|
||||
def _dedent_two_spaces_sub(self, match):
|
||||
return re.sub(r'(?m)^ ', '', match.group(1))
|
||||
|
||||
def _block_quote_sub(self, match):
|
||||
bq = match.group(1)
|
||||
bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
|
||||
bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
|
||||
is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
|
||||
# trim one level of quoting
|
||||
if is_spoiler:
|
||||
bq = self._bq_one_level_re_spoiler.sub('', bq)
|
||||
else:
|
||||
bq = self._bq_one_level_re.sub('', bq)
|
||||
# trim whitespace-only lines
|
||||
bq = self._ws_only_line_re.sub('', bq)
|
||||
bq = self._run_block_gamut(bq) # recurse
|
||||
|
||||
bq = re.sub('(?m)^', ' ', bq)
|
||||
# These leading spaces screw with <pre> content, so we need to fix that:
|
||||
bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
|
||||
|
||||
return "<blockquote>\n%s\n</blockquote>\n\n" % bq
|
||||
if is_spoiler:
|
||||
return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
|
||||
else:
|
||||
return '<blockquote>\n%s\n</blockquote>\n\n' % bq
|
||||
|
||||
def _do_block_quotes(self, text):
|
||||
if '>' not in text:
|
||||
return text
|
||||
if 'spoiler' in self.extras:
|
||||
return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
|
||||
else:
|
||||
return self._block_quote_re.sub(self._block_quote_sub, text)
|
||||
|
||||
def _form_paragraphs(self, text):
|
||||
|
@ -1774,7 +1953,7 @@ class Markdown(object):
|
|||
'↩</a>' % (id, i+1))
|
||||
if footer[-1].endswith("</p>"):
|
||||
footer[-1] = footer[-1][:-len("</p>")] \
|
||||
+ ' ' + backlink + "</p>"
|
||||
+ ' ' + backlink + "</p>"
|
||||
else:
|
||||
footer.append("\n<p>%s</p>" % backlink)
|
||||
footer.append('</li>')
|
||||
|
@ -1910,7 +2089,7 @@ class MarkdownWithExtras(Markdown):
|
|||
extras = ["footnotes", "code-color"]
|
||||
|
||||
|
||||
#---- internal support functions
|
||||
# ---- internal support functions
|
||||
|
||||
class UnicodeWithAttrs(unicode):
|
||||
"""A subclass of unicode used for the return value of conversion to
|
||||
|
@ -1979,6 +2158,7 @@ def _curry(*args, **kwargs):
|
|||
return function(*args + rest, **combined)
|
||||
return result
|
||||
|
||||
|
||||
# Recipe: regex_from_encoded_pattern (1.0)
|
||||
def _regex_from_encoded_pattern(s):
|
||||
"""'foo' -> re.compile(re.escape('foo'))
|
||||
|
@ -2008,6 +2188,7 @@ def _regex_from_encoded_pattern(s):
|
|||
else: # not an encoded regex
|
||||
return re.compile(re.escape(s))
|
||||
|
||||
|
||||
# Recipe: dedent (0.1.2)
|
||||
def _dedentlines(lines, tabsize=8, skip_first_line=False):
|
||||
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
|
||||
|
@ -2025,7 +2206,6 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
|
|||
if DEBUG:
|
||||
print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
|
||||
% (tabsize, skip_first_line))
|
||||
indents = []
|
||||
margin = None
|
||||
for i, line in enumerate(lines):
|
||||
if i == 0 and skip_first_line: continue
|
||||
|
@ -2079,6 +2259,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
|
|||
lines[i] = lines[i][removed:]
|
||||
return lines
|
||||
|
||||
|
||||
def _dedent(text, tabsize=8, skip_first_line=False):
|
||||
"""_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
|
||||
|
||||
|
@ -2105,6 +2286,7 @@ class _memoized(object):
|
|||
def __init__(self, func):
|
||||
self.func = func
|
||||
self.cache = {}
|
||||
|
||||
def __call__(self, *args):
|
||||
try:
|
||||
return self.cache[args]
|
||||
|
@ -2115,6 +2297,7 @@ class _memoized(object):
|
|||
# uncachable -- for instance, passing a list as an argument.
|
||||
# Better to not cache than to blow up entirely.
|
||||
return self.func(*args)
|
||||
|
||||
def __repr__(self):
|
||||
"""Return the function's docstring."""
|
||||
return self.func.__doc__
|
||||
|
@ -2141,6 +2324,7 @@ def _xml_oneliner_re_from_tab_width(tab_width):
|
|||
""" % (tab_width - 1), re.X)
|
||||
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
|
||||
|
||||
|
||||
def _hr_tag_re_from_tab_width(tab_width):
|
||||
return re.compile(r"""
|
||||
(?:
|
||||
|
@ -2191,18 +2375,19 @@ def _xml_encode_email_char_at_random(ch):
|
|||
return '&#%s;' % ord(ch)
|
||||
|
||||
|
||||
|
||||
#---- mainline
|
||||
# ---- mainline
|
||||
|
||||
class _NoReflowFormatter(optparse.IndentedHelpFormatter):
|
||||
"""An optparse formatter that does NOT reflow the description."""
|
||||
def format_description(self, description):
|
||||
return description or ""
|
||||
|
||||
|
||||
def _test():
    """Run the doctests via ``doctest.testmod()``."""
    from doctest import testmod
    testmod()
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
if argv is None:
|
||||
argv = sys.argv
|
||||
|
@ -2319,7 +2504,7 @@ def main(argv=None):
|
|||
sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
|
||||
if extras and "toc" in extras:
|
||||
log.debug("toc_html: " +
|
||||
html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
|
||||
str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
|
||||
if opts.compare:
|
||||
test_dir = join(dirname(dirname(abspath(__file__))), "test")
|
||||
if exists(join(test_dir, "test_markdown2.py")):
|
||||
|
@ -2334,4 +2519,4 @@ def main(argv=None):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit( main(sys.argv) )
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
Loading…
Reference in New Issue