Dynastie

Dynastie Commit Details

Date:2020-03-20 16:55:45 (4 months 13 days ago)
Author:Grégory Soutadé
Branch:master
Commit:7cb4f1d3d7504b6e45a2f3179f40c8b38cbcbf55
Parents: 4b642fa48a04d6b9ba86c653badac7cca0505793
Message:Update Markdown parser from 2.1.1 to 2.3.2

Changes:
Mdynastie/generators/markdown2.py (60 diffs)

File differences

dynastie/generators/markdown2.py
5353
5454
5555
56
57
56
57
58
5859
5960
6061
......
7071
7172
7273
74
75
7376
7477
7578
79
80
81
7682
7783
7884
......
8288
8389
8490
85
91
8692
8793
8894
89
9095
91
9296
9397
9498
......
100104
101105
102106
103
107
104108
105
106
107
108
109
110
111
109
112110
113111
114112
......
127125
128126
129127
130
131
128
132129
133130
134131
......
145142
146143
147144
148
149
150
145
151146
152147
153148
154149
155
156
150
157151
158152
159153
......
167161
168162
169163
164
170165
171166
172167
......
175170
176171
177172
173
178174
179175
180176
......
222218
223219
224220
225
221
226222
227223
228224
......
246242
247243
248244
249
245
250246
251247
252248
......
254250
255251
256252
253
254
255
256
257
257258
258259
259260
......
268269
269270
270271
271
272
272273
273274
274275
......
288289
289290
290291
291
292
293
292294
293295
294296
......
308310
309311
310312
313
314
315
311316
312317
313318
314319
315320
316321
317
322
318323
319324
320325
......
340345
341346
342347
348
349
350
343351
344352
345353
......
363371
364372
365373
366
374
367375
368376
369377
370378
371379
372
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
373395
374396
375
376
377
378397
379398
380399
......
387406
388407
389408
390
391409
392410
393411
......
412430
413431
414432
415
433
416434
417435
418436
......
448466
449467
450468
451
469
452470
453471
454472
......
469487
470488
471489
472
473
474
490
491
492
475493
476494
477495
......
505523
506524
507525
508
509
510
511
512
513
526
527
528
529
530
531
532
533
534
535
536
514537
515
538
516539
517540
518541
......
528551
529552
530553
531
554
555
556
557
532558
533559
534560
......
776802
777803
778804
779
780
781
782
783
784
805
785806
786807
787808
......
798819
799820
800821
801
802
803
804
805
806
807
822
808823
809824
810825
......
812827
813828
814829
830
831
815832
816833
817834
......
831848
832849
833850
834
851
835852
836853
837854
......
852869
853870
854871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
855945
856946
857
947
858948
859949
860950
861951
862952
863
864
953
954
865955
866956
867957
......
907997
908998
909999
1000
1001
1002
9101003
9111004
9121005
9131006
9141007
9151008
916
1009
1010
1011
1012
9171013
9181014
9191015
......
10031099
10041100
10051101
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1102
1103
1104
1105
10181106
1019
1107
10201108
1021
1109
10221110
10231111
10241112
......
10291117
10301118
10311119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
10321166
10331167
10341168
......
10461180
10471181
10481182
1049
1183
10501184
10511185
10521186
......
11101244
11111245
11121246
1113
1114
1115
1247
1248
1249
11161250
11171251
11181252
......
11201254
11211255
11221256
1123
1124
1257
1258
11251259
1126
1127
1128
11291260
11301261
11311262
......
11381269
11391270
11401271
1272
11411273
1142
1143
1144
1145
1146
1147
1148
1149
1150
1274
1275
1276
1277
1278
11511279
11521280
11531281
1154
1282
11551283
11561284
11571285
......
11611289
11621290
11631291
1164
1292
11651293
11661294
11671295
......
11861314
11871315
11881316
1189
11901317
11911318
11921319
......
11941321
11951322
11961323
1197
1324
1325
11981326
11991327
1200
1328
12011329
12021330
12031331
......
12581386
12591387
12601388
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1389
1390
1391
1392
1393
12811394
12821395
12831396
12841397
12851398
1286
1287
1288
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
12891415
12901416
12911417
12921418
12931419
1294
1420
12951421
12961422
12971423
1298
1424
12991425
13001426
13011427
......
13071433
13081434
13091435
1310
13111436
13121437
13131438
......
13151440
13161441
13171442
1318
1319
1320
13211443
1444
1445
1446
13221447
1323
1448
13241449
13251450
13261451
......
13801505
13811506
13821507
1383
1384
1508
1509
1510
13851511
13861512
13871513
......
13901516
13911517
13921518
1393
1519
13941520
13951521
13961522
13971523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
13981539
13991540
14001541
14011542
1402
14031543
14041544
14051545
......
14091549
14101550
14111551
1552
1553
1554
1555
14121556
14131557
14141558
......
14971641
14981642
14991643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
15001655
15011656
1657
15021658
15031659
15041660
......
15351691
15361692
15371693
1694
1695
1696
15381697
15391698
15401699
15411700
15421701
1543
1702
15441703
15451704
15461705
15471706
15481707
15491708
1550
1709
15511710
15521711
15531712
......
16201779
16211780
16221781
1782
1783
1784
1785
1786
16231787
16241788
1625
1626
16271789
16281790
1791
1792
16291793
16301794
16311795
......
16701834
16711835
16721836
1673
1837
16741838
16751839
16761840
16771841
1678
1842
16791843
16801844
16811845
......
16861850
16871851
16881852
1689
1853
16901854
16911855
1692
1856
16931857
16941858
16951859
16961860
16971861
1698
1699
1700
1862
1863
1864
1865
1866
1867
17011868
17021869
17031870
17041871
17051872
17061873
1707
1708
1874
1875
1876
1877
1878
1879
1880
1881
17091882
17101883
17111884
17121885
17131886
17141887
1715
1888
1889
1890
1891
17161892
17171893
17181894
17191895
1720
1896
1897
1898
1899
17211900
17221901
17231902
......
17741953
17751954
17761955
1777
1956
17781957
17791958
17801959
......
19102089
19112090
19122091
1913
2092
19142093
19152094
19162095
......
19792158
19802159
19812160
2161
19822162
19832163
19842164
......
20052185
20062186
20072187
2008
2188
20092189
20102190
2191
20112192
20122193
20132194
......
20252206
20262207
20272208
2028
20292209
20302210
20312211
......
20362216
20372217
20382218
2039
2219
20402220
20412221
20422222
2043
2223
20442224
20452225
20462226
......
20792259
20802260
20812261
2262
20822263
20832264
20842265
......
20962277
20972278
20982279
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
21212304
21222305
21232306
......
21412324
21422325
21432326
2327
21442328
2145
2329
21462330
21472331
21482332
......
21912375
21922376
21932377
2194
2195
2378
21962379
21972380
21982381
21992382
22002383
22012384
2385
22022386
22032387
22042388
22052389
2390
22062391
22072392
22082393
......
23192504
23202505
23212506
2322
2507
23232508
23242509
23252510
......
23342519
23352520
23362521
2337
2522
* header-ids: Adds "id" attributes to headers. The id value is a slug of
the header text.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
string to use for a "class" tag attribute. Currently only supports
"pre" and "code" tags. Add an issue if you require this for other tags.
string to use for a "class" tag attribute. Currently only supports "img",
"table", "pre" and "code" tags. Add an issue if you require this for other
tags.
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
have markdown processing be done on its contents. Similar to
<http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
* smarty-pants: Replaces ' and " with curly quotation marks or curly
apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* toc: The returned HTML string gets a new "toc_html" attribute which is
a Table of Contents for the document. (experimental)
* xml: Passes one-liner processing instructions and namespaced XML tags.
* tables: Tables using the same format as GFM
<https://help.github.com/articles/github-flavored-markdown#tables> and
PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* wiki-tables: Google Code Wiki-style tables. See
<http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
"""
# not yet sure if there implications with this. Compare 'pydoc sre'
# and 'perldoc perlre'.
__version_info__ = (2, 1, 1)
__version_info__ = (2, 3, 2)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"
import os
import sys
from pprint import pprint
import re
import logging
try:
import codecs
#---- Python version compat
# ---- Python version compat
try:
from urllib.parse import quote # python3
except ImportError:
from urllib import quote # python2
if sys.version_info[:2] < (2,4):
from sets import Set as set
if sys.version_info[:2] < (2, 4):
def reversed(sequence):
for i in sequence[::-1]:
yield i
base_string_type = str
#---- globals
# ---- globals
DEBUG = False
log = logging.getLogger("markdown")
for ch in '\\`*_{}[]()>#+-.!'])
#---- exceptions
# ---- exceptions
class MarkdownError(Exception):
pass
#---- public api
# ---- public api
def markdown_path(path, encoding="utf-8",
html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text)
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
safe_mode=None, extras=None, link_patterns=None,
use_file_vars=False):
link_patterns=link_patterns,
use_file_vars=use_file_vars).convert(text)
class Markdown(object):
# The dict of "extras" to enable in processing -- a mapping of
# extra name to argument for the extra. Most extras do not have an
extras = dict([(e, None) for e in extras])
self.extras.update(extras)
assert isinstance(self.extras, dict)
if "toc" in self.extras and not "header-ids" in self.extras:
if "toc" in self.extras and "header-ids" not in self.extras:
self.extras["header-ids"] = None # "toc" implies "header-ids"
self._instance_extras = self.extras.copy()
self.footnotes = {}
self.footnote_ids = []
if "header-ids" in self.extras:
self._count_from_header_id = {} # no `defaultdict` in Python 2.4
self._count_from_header_id = {} # no `defaultdict` in Python 2.4
if "metadata" in self.extras:
self.metadata = {}
# should only be used in <a> tags with an "href" attribute.
_a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
# Opens the linked document in a new window or tab
# should only used in <a> tags with an "target" attribute.
# same with _a_nofollow
_a_blank = _a_nofollow
def convert(self, text):
"""Convert the given text."""
# Main function. The order in which other subs are called here is
self.reset()
if not isinstance(text, unicode):
#TODO: perhaps shouldn't presume UTF-8 for string input?
# TODO: perhaps shouldn't presume UTF-8 for string input?
text = unicode(text, 'utf-8')
if self.use_file_vars:
self.extras[ename] = earg
# Standardize line endings:
text = re.sub("\r\n|\r", "\n", text)
text = text.replace("\r\n", "\n")
text = text.replace("\r", "\n")
# Make sure $text ends with a couple of newlines:
text += "\n\n"
text = self.preprocess(text)
if "fenced-code-blocks" in self.extras and not self.safe_mode:
text = self._do_fenced_code_blocks(text)
if self.safe_mode:
text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries
text = self._hash_html_blocks(text, raw=True)
if "fenced-code-blocks" in self.extras:
if "fenced-code-blocks" in self.extras and self.safe_mode:
text = self._do_fenced_code_blocks(text)
# Strip link definitions, store in hashes.
if "nofollow" in self.extras:
text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)
if "target-blank-links" in self.extras:
text = self._a_blank.sub(r'<\1 target="_blank"\2', text)
text += "\n"
rv = UnicodeWithAttrs(text)
"""
return text
# Is metadata if the content starts with '---'-fenced `key: value`
# Is metadata if the content starts with optional '---'-fenced `key: value`
# pairs. E.g. (indented for presentation):
# ---
# foo: bar
# another-var: blah blah
# ---
_metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
# # header
# or:
# foo: bar
# another-var: blah blah
#
# # header
_metadata_pat = re.compile(r"""
^
(?:---[\ \t]*\n)? # optional "---"
((?:[ \t]*[^ \t:]+[\ \t]*:[^\n]*\n)+) # "key: value" pairs
(?:---[ \t]*)? # optional "---"
\n""",
re.VERBOSE
)
def _extract_metadata(self, text):
# fast test
if not text.startswith("---"):
return text
match = self._metadata_pat.match(text)
if not match:
return text
return tail
_emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
# This regular expression is intended to match blocks like this:
# PREFIX Local Variables: SUFFIX
http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
"""
emacs_vars = {}
SIZE = pow(2, 13) # 8kB
SIZE = pow(2, 13) # 8kB
# Search near the start for a '-*-'-style one-liner of variables.
head = text[:SIZE]
prefix = match.group("prefix")
suffix = match.group("suffix")
lines = match.group("content").splitlines(0)
#print "prefix=%r, suffix=%r, content=%r, lines: %s"\
# print "prefix=%r, suffix=%r, content=%r, lines: %s"\
# % (prefix, suffix, match.group("content"), lines)
# Validate the Local Variables block: proper prefix and suffix
# Parse out one emacs var per line.
continued_for = None
for line in lines[:-1]: # no var on the last line ("PREFIX End:")
if prefix: line = line[len(prefix):] # strip prefix
if suffix: line = line[:-len(suffix)] # strip suffix
for line in lines[:-1]: # no var on the last line ("PREFIX End:")
if prefix: line = line[len(prefix):] # strip prefix
if suffix: line = line[:-len(suffix)] # strip suffix
line = line.strip()
if continued_for:
variable = continued_for
return emacs_vars
# Cribbed from a post by Bart Lateur:
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
_detab_re = re.compile(r'(.*?)\t', re.M)
def _detab_sub(self, match):
g1 = match.group(1)
return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
def _detab_line(self, line):
r"""Recusively convert tabs to spaces in a single line.
Called from _detab()."""
if '\t' not in line:
return line
chunk1, chunk2 = line.split('\t', 1)
chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
output = chunk1 + chunk2
return self._detab_line(output)
def _detab(self, text):
r"""Remove (leading?) tabs from a file.
r"""Iterate text line by line and convert tabs to spaces.
>>> m = Markdown()
>>> m._detab("\tfoo")
"""
if '\t' not in text:
return text
return self._detab_re.subn(self._detab_sub, text)[0]
output = []
for line in text.splitlines():
output.append(self._detab_line(line))
return '\n'.join(output)
# I broke out the html5 tags here and add them to _block_tags_a and
# _block_tags_b. This way html5 tags are easy to keep track of.
re.X | re.M)
return footnote_def_re.sub(self._extract_footnote_def_sub, text)
_hr_data = [
('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
]
_hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
def _run_block_gamut(self, text):
# These are all the transformations that form block-level
# Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
# hr chars to one or two. We'll reproduce that limit here.
hr = "\n<hr"+self.empty_element_suffix+"\n"
for ch, regex in self._hr_data:
if ch in text:
for m in reversed(list(regex.finditer(text))):
tail = m.group(1).rstrip()
if not tail.strip(ch + ' ') and tail.count(" ") == 0:
start, end = m.span()
text = text[:start] + hr + text[end:]
text = re.sub(self._hr_re, hr, text)
text = self._do_lists(text)
text = self._prepare_pyshell_blocks(text)
if "wiki-tables" in self.extras:
text = self._do_wiki_tables(text)
if "tables" in self.extras:
text = self._do_tables(text)
text = self._do_code_blocks(text)
lines = match.group(0).splitlines(0)
_dedentlines(lines)
indent = ' ' * self.tab_width
s = ('\n' # separate from possible cuddled paragraph
s = ('\n' # separate from possible cuddled paragraph
+ indent + ('\n'+indent).join(lines)
+ '\n\n')
return s
return _pyshell_block_re.sub(self._pyshell_block_sub, text)
def _table_sub(self, match):
trim_space_re = '^[ \t\n]+|[ \t\n]+$'
trim_bar_re = '^\||\|$'
head, underline, body = match.groups()
# Determine aligns for columns.
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)).split('|')]
align_from_col_idx = {}
for col_idx, col in enumerate(cols):
if col[0] == ':' and col[-1] == ':':
align_from_col_idx[col_idx] = ' align="center"'
elif col[0] == ':':
align_from_col_idx[col_idx] = ' align="left"'
elif col[-1] == ':':
align_from_col_idx[col_idx] = ' align="right"'
# thead
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <th%s>%s</th>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</thead>')
# tbody
hlines.append('<tbody>')
for line in body.strip('\n').split('\n'):
hlines.append('<tr>')
cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')]
for col_idx, col in enumerate(cols):
hlines.append(' <td%s>%s</td>' % (
align_from_col_idx.get(col_idx, ''),
self._run_span_gamut(col)
))
hlines.append('</tr>')
hlines.append('</tbody>')
hlines.append('</table>')
return '\n'.join(hlines) + '\n'
def _do_tables(self, text):
"""Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
"""
less_than_tab = self.tab_width - 1
table_re = re.compile(r'''
(?:(?<=\n\n)|\A\n?) # leading blank line
^[ ]{0,%d} # allowed whitespace
(.*[|].*) \n # $1: header row (at least one pipe)
^[ ]{0,%d} # allowed whitespace
( # $2: underline row
# underline row with leading bar
(?: \|\ *:?-+:?\ * )+ \|? \n
|
# or, underline row without leading bar
(?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
)
( # $3: data rows
(?:
^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
.*\|.* \n
)+
)
''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
return table_re.sub(self._table_sub, text)
def _wiki_table_sub(self, match):
ttext = match.group(0).strip()
#print 'wiki table: %r' % match.group(0)
# print 'wiki table: %r' % match.group(0)
rows = []
for line in ttext.splitlines(0):
line = line.strip()[2:-2].strip()
row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
rows.append(row)
#pprint(rows)
hlines = ['<table>', '<tbody>']
# pprint(rows)
hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
for row in rows:
hrow = ['<tr>']
for cell in row:
text = self._encode_amps_and_angles(text)
if "strike" in self.extras:
text = self._do_strike(text)
text = self._do_italics_and_bold(text)
if "smarty-pants" in self.extras:
text = self._do_smart_punctuation(text)
# Do hard breaks:
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
if "break-on-newline" in self.extras:
text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
else:
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
return text
raise MarkdownError("invalid value for 'safe_mode': %r (must be "
"'escape' or 'replace')" % self.safe_mode)
_tail_of_inline_link_re = re.compile(r'''
# Match tail of: [text](/url/) or [text](/url/ "title")
\( # literal paren
[ \t]*
(?P<url> # \1
<.*?>
|
.*?
)
[ \t]*
( # \2
(['"]) # quote char = \3
_inline_link_title = re.compile(r'''
( # \1
[ \t]+
(['"]) # quote char = \2
(?P<title>.*?)
\3 # matching quote
\2
)? # title is optional
\)
\)$
''', re.X | re.S)
_tail_of_reference_link_re = re.compile(r'''
# Match tail of: [text][id]
\]
''', re.X | re.S)
_whitespace = re.compile(r'\s*')
_strip_anglebrackets = re.compile(r'<(.*)>.*')
def _find_non_whitespace(self, text, start):
"""Returns the index of the first non-whitespace character in text
after (and including) start
"""
match = self._whitespace.match(text, start)
return match.end()
def _find_balanced(self, text, start, open_c, close_c):
"""Returns the index where the open_c and close_c characters balance
out - the same number of open_c and close_c are encountered - or the
end of string if it's reached before the balance point is found.
"""
i = start
l = len(text)
count = 1
while count > 0 and i < l:
if text[i] == open_c:
count += 1
elif text[i] == close_c:
count -= 1
i += 1
return i
def _extract_url_and_title(self, text, start):
"""Extracts the url and (optional) title from the tail of a link"""
# text[start] equals the opening parenthesis
idx = self._find_non_whitespace(text, start+1)
if idx == len(text):
return None, None, None
end_idx = idx
has_anglebrackets = text[idx] == "<"
if has_anglebrackets:
end_idx = self._find_balanced(text, end_idx+1, "<", ">")
end_idx = self._find_balanced(text, end_idx, "(", ")")
match = self._inline_link_title.search(text, idx, end_idx)
if not match:
return None, None, None
url, title = text[idx:match.start()], match.group("title")
if has_anglebrackets:
url = self._strip_anglebrackets.sub(r'\1', url)
return url, title, end_idx
def _do_links(self, text):
"""Turn Markdown link shortcuts into XHTML <a> and <img> tags.
anchor_allowed_pos = 0
curr_pos = 0
while True: # Handle the next link.
while True: # Handle the next link.
# The next '[' is the start of:
# - an inline anchor: [text](url "title")
# - a reference anchor: [text][id]
return text
# Inline anchor or img?
if text[p] == '(': # attempt at perf improvement
match = self._tail_of_inline_link_re.match(text, p)
if match:
if text[p] == '(': # attempt at perf improvement
url, title, url_end_idx = self._extract_url_and_title(text, p)
if url is not None:
# Handle an inline anchor or img.
is_img = start_idx > 0 and text[start_idx-1] == "!"
if is_img:
is_inline_img = start_idx > 0 and text[start_idx-1] == "#"
if is_inline_img:
start_idx -= 1
is_img = 1
start_idx -= 1
is_img = 1
url, title = match.group("url"), match.group("title")
if url and url[0] == '<':
url = url[1:-1] # '<url>' -> 'url'
# We've got to encode these to avoid conflicting
# with italics/bold.
url = url.replace('*', self._escape_table['*']) \
else:
title_str = ''
if is_img:
img_class_str = self._html_class_str_from_tag("img")
if is_inline_img:
result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, self.empty_element_suffix)
else:
result = '<img src="%s" alt="%s"%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, self.empty_element_suffix)
img_class_str = ' class="inlineimage"'
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'),
_xml_escape_attr(link_text),
title_str, img_class_str, self.empty_element_suffix)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
text = text[:start_idx] + result + text[url_end_idx:]
elif start_idx >= anchor_allowed_pos:
result_head = '<a href="%s"%s>' % (url, title_str)
result = '%s%s</a>' % (result_head, link_text)
# anchor_allowed_pos on.
curr_pos = start_idx + len(result_head)
anchor_allowed_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
text = text[:start_idx] + result + text[url_end_idx:]
else:
# Anchor not allowed here.
curr_pos = start_idx + 1
.replace('_', self._escape_table['_'])
title = self.titles.get(link_id)
if title:
before = title
title = _xml_escape_attr(title) \
.replace('*', self._escape_table['*']) \
.replace('_', self._escape_table['_'])
else:
title_str = ''
if is_img:
result = '<img src="%s" alt="%s"%s%s' \
img_class_str = self._html_class_str_from_tag("img")
result = '<img src="%s" alt="%s"%s%s%s' \
% (url.replace('"', '&quot;'),
link_text.replace('"', '&quot;'),
title_str, self.empty_element_suffix)
title_str, img_class_str, self.empty_element_suffix)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
curr_pos = start_idx + len(result)
self._toc = []
self._toc.append((level, id, self._unescape_special_chars(name)))
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
def _setext_h_sub(self, match):
n = {"=": 1, "-": 2}[match.group(2)[0]]
demote_headers = self.extras.get("demote-headers")
if demote_headers:
n = min(n + demote_headers, 6)
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(1),
self.extras["header-ids"], n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(1))
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
_atx_h_re = re.compile(r'''
^(\#{1,6}) # \1 = string of #'s
[ \t]+
_h_re_base = r'''
(^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
|
(^(\#{1,6}) # \1 = string of #'s
[ \t]%s
(.+?) # \2 = Header text
[ \t]*
(?<!\\) # ensure not an escaped trailing '#'
\#* # optional closing #'s (not counted)
\n+
''', re.X | re.M)
def _atx_h_sub(self, match):
n = len(match.group(1))
)
'''
_h_re = re.compile(_h_re_base % '*', re.X | re.M)
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
def _h_sub(self, match):
if match.group(1) is not None:
# Setext header
n = {"=": 1, "-": 2}[match.group(3)[0]]
header_group = match.group(2)
else:
# atx header
n = len(match.group(5))
header_group = match.group(6)
demote_headers = self.extras.get("demote-headers")
if demote_headers:
n = min(n + demote_headers, 6)
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(match.group(2),
header_id = self.header_id_from_text(header_group,
self.extras["header-ids"], n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(match.group(2))
html = self._run_span_gamut(header_group)
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
#
# Header 2
# --------
text = self._setext_h_re.sub(self._setext_h_sub, text)
# atx-style headers:
# # Header 1
# ## Header 2 with closing hashes ##
# ...
# ###### Header 6
text = self._atx_h_re.sub(self._atx_h_sub, text)
return text
if 'tag-friendly' in self.extras:
return self._h_re_tag_friendly.sub(self._h_sub, text)
return self._h_re.sub(self._h_sub, text)
_marker_ul_chars = '*+-'
_marker_ul_chars = '*+-'
_marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
_marker_ul = '(?:[%s])' % _marker_ul_chars
_marker_ol = r'(?:\d+\.)'
hits.sort()
match = hits[0][1]
start, end = match.span()
text = text[:start] + self._list_sub(match) + text[end:]
pos = end
middle = self._list_sub(match)
text = text[:start] + middle + text[end:]
pos = start + len(middle) # start pos for next attempted match
return text
(^[ \t]*) # leading whitespace = \2
(?P<marker>%s) [ \t]+ # list marker = \3
((?:.+?) # list item text = \4
(\n{1,2})) # eols = \5
(\n{1,2})) # eols = \5
(?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
''' % (_marker_any, _marker_any),
re.M | re.X | re.S)
_task_list_item_re = re.compile(r'''
(\[[\ x]\])[ \t]+ # tasklist marker = \1
(.*) # list item text = \2
''', re.M | re.X | re.S)
_task_list_warpper_str = r'<p><input type="checkbox" class="task-list-item-checkbox" %sdisabled>%s</p>'
def _task_list_item_sub(self, match):
marker = match.group(1)
item_text = match.group(2)
if marker == '[x]':
return self._task_list_warpper_str % ('checked ', item_text)
elif marker == '[ ]':
return self._task_list_warpper_str % ('', item_text)
_last_li_endswith_two_eols = False
def _list_item_sub(self, match):
item = match.group(4)
leading_line = match.group(1)
leading_space = match.group(2)
if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
item = self._run_block_gamut(self._outdent(item))
else:
item = item[:-1]
item = self._run_span_gamut(item)
self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
if "task_list" in self.extras:
item = self._task_list_item_re.sub(self._task_list_item_sub, item)
return "<li>%s</li>\n" % item
def _process_list_items(self, list_str):
formatter_opts = self.extras['code-color'] or {}
if lexer_name:
def unhash_code(codeblock):
for key, sanitized in list(self.html_spans.items()):
codeblock = codeblock.replace(key, sanitized)
replacements = [
("&amp;", "&"),
("&lt;", "<"),
("&gt;", ">")
]
for old, new in replacements:
codeblock = codeblock.replace(old, new)
return codeblock
lexer = self._get_pygments_lexer(lexer_name)
if lexer:
codeblock = unhash_code( codeblock )
colored = self._color_with_pygments(codeblock, lexer,
**formatter_opts)
return "\n\n%s\n\n" % colored
)+
)
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
# Lookahead to make sure this block isn't already in a code block.
# Needed when syntax highlighting is being used.
(?![^<]*\</code\>)
''' % (self.tab_width, self.tab_width),
re.M | re.X)
return code_block_re.sub(self._code_block_sub, text)
_fenced_code_block_re = re.compile(r'''
(?:\n\n|\A\n?)
(?:\n+|\A\n?)
^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
(.*?) # $2 = code block content
^```[ \t]*\n # closing fence
''', re.M | re.X | re.S)
def _fenced_code_block_sub(self, match):
return self._code_block_sub(match, is_fenced_code_block=True);
return self._code_block_sub(match, is_fenced_code_block=True)
def _do_fenced_code_blocks(self, text):
"""Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
self._escape_table[text] = hashed
return hashed
_strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
def _do_strike(self, text):
text = self._strike_re.sub(r"<strike>\1</strike>", text)
return text
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
_em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
_code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S)
_code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
def _do_italics_and_bold(self, text):
# <strong> must go first:
if "code-friendly" in self.extras:
<http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
discussion of some diversion from the original SmartyPants.
"""
if "'" in text: # guard for perf
if "'" in text: # guard for perf
text = self._do_smart_contractions(text)
text = self._opening_single_quote_re.sub("&#8216;", text)
text = self._closing_single_quote_re.sub("&#8217;", text)
if '"' in text: # guard for perf
if '"' in text: # guard for perf
text = self._opening_double_quote_re.sub("&#8220;", text)
text = self._closing_double_quote_re.sub("&#8221;", text)
text = text.replace(". . .", "&#8230;")
return text
_block_quote_re = re.compile(r'''
_block_quote_base = r'''
( # Wrap whole match in \1
(
^[ \t]*>[ \t]? # '>' at the start of a line
^[ \t]*>%s[ \t]? # '>' at the start of a line
.+\n # rest of the first line
(.+\n)* # subsequent consecutive lines
\n* # blanks
)+
)
''', re.M | re.X)
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
'''
_block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
_block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
_bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
_bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
_html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
def _dedent_two_spaces_sub(self, match):
    """Regex substitution callback that strips leading indentation.

    Takes group 1 of ``match`` and removes the leading-space prefix
    matched by the pattern below from the start of every line.

    Returns the dedented text.
    """
    # NOTE(review): the name says "two spaces" but the pattern as shown
    # strips a single space; whitespace may have been collapsed when this
    # listing was extracted -- confirm against the upstream file.
    return re.sub(r'(?m)^ ', '', match.group(1))
def _block_quote_sub(self, match):
bq = match.group(1)
bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
# trim one level of quoting
if is_spoiler:
bq = self._bq_one_level_re_spoiler.sub('', bq)
else:
bq = self._bq_one_level_re.sub('', bq)
# trim whitespace-only lines
bq = self._ws_only_line_re.sub('', bq)
bq = self._run_block_gamut(bq) # recurse
bq = re.sub('(?m)^', ' ', bq)
# These leading spaces screw with <pre> content, so we need to fix that:
bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
return "<blockquote>\n%s\n</blockquote>\n\n" % bq
if is_spoiler:
return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
else:
return '<blockquote>\n%s\n</blockquote>\n\n' % bq
def _do_block_quotes(self, text):
    """Replace every block quote in ``text`` with its HTML rendering.

    Text without any '>' character is returned untouched.  Otherwise each
    match of the block-quote pattern is handed to ``_block_quote_sub``;
    the spoiler-aware pattern is used when the 'spoiler' extra is enabled.
    """
    if '>' not in text:
        return text
    if 'spoiler' in self.extras:
        return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
    else:
        return self._block_quote_re.sub(self._block_quote_sub, text)
def _form_paragraphs(self, text):
# Strip leading and trailing lines:
'&#8617;</a>' % (id, i+1))
if footer[-1].endswith("</p>"):
footer[-1] = footer[-1][:-len("</p>")] \
+ '&nbsp;' + backlink + "</p>"
+ '&#160;' + backlink + "</p>"
else:
footer.append("\n<p>%s</p>" % backlink)
footer.append('</li>')
extras = ["footnotes", "code-color"]
#---- internal support functions
# ---- internal support functions
class UnicodeWithAttrs(unicode):
"""A subclass of unicode used for the return value of conversion to
return function(*args + rest, **combined)
return result
# Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
"(must be one of '%s')"
% (char, s, ''.join(list(flag_from_char.keys()))))
return re.compile(s[1:idx], flags)
else: # not an encoded regex
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
if DEBUG:
print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
% (tabsize, skip_first_line))
indents = []
margin = None
for i, line in enumerate(lines):
if i == 0 and skip_first_line: continue
elif ch == '\t':
indent += tabsize - (indent % tabsize)
elif ch in '\r\n':
continue # skip all-whitespace lines
continue # skip all-whitespace lines
else:
break
else:
continue # skip all-whitespace lines
continue # skip all-whitespace lines
if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
if margin is None:
margin = indent
lines[i] = lines[i][removed:]
return lines
def _dedent(text, tabsize=8, skip_first_line=False):
"""_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
class _memoized(object):
    """Decorator that caches a function's return value each time it is called.

    If called later with the same arguments, the cached value is returned, and
    not re-evaluated.  Only positional arguments are supported.

    http://wiki.python.org/moin/PythonDecoratorLibrary
    """

    def __init__(self, func):
        self.func = func    # the wrapped callable
        self.cache = {}     # maps the positional-args tuple to the result

    def __call__(self, *args):
        try:
            return self.cache[args]
        except KeyError:
            # First call with these arguments: compute and remember.
            self.cache[args] = value = self.func(*args)
            return value
        except TypeError:
            # uncachable -- for instance, passing a list as an argument.
            # Better to not cache than to blow up entirely.
            return self.func(*args)

    def __repr__(self):
        """Return the function's docstring, or its repr if it has none.

        ``__repr__`` has to return a string; a wrapped function without a
        docstring would otherwise make ``repr()`` raise TypeError.
        """
        return self.func.__doc__ or repr(self.func)
def _xml_oneliner_re_from_tab_width(tab_width):
""" % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
def _hr_tag_re_from_tab_width(tab_width):
return re.compile(r"""
return re.compile(r"""
(?:
(?<=\n\n) # Starting after a blank line
| # or
return '&#%s;' % ord(ch)
#---- mainline
# ---- mainline
class _NoReflowFormatter(optparse.IndentedHelpFormatter):
    """An optparse formatter that does NOT reflow the description."""

    def format_description(self, description):
        """Return ``description`` verbatim, or "" when it is empty/None."""
        return description or ""
def _test():
    """Run the doctests via ``doctest.testmod()``."""
    # Imported locally so that doctest is only loaded when tests are run.
    import doctest
    doctest.testmod()
def main(argv=None):
if argv is None:
argv = sys.argv
sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
if extras and "toc" in extras:
log.debug("toc_html: " +
html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
if opts.compare:
test_dir = join(dirname(dirname(abspath(__file__))), "test")
if exists(join(test_dir, "test_markdown2.py")):
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))

Archive Download the corresponding diff file

Branches

Tags