2
# Copyright (c) 2007-2008 ActiveState Corp.
3
# License: MIT (http://www.opensource.org/licenses/mit-license.php)
5
r"""A fast and complete Python implementation of Markdown.
7
[from http://daringfireball.net/projects/markdown/]
8
> Markdown is a text-to-HTML filter; it translates an easy-to-read /
9
> easy-to-write structured text format into HTML. Markdown's text
10
> format is most similar to that of plain text email, and supports
11
> features such as headers, *emphasis*, code blocks, blockquotes, and
14
> Markdown's syntax is designed not as a generic markup language, but
15
> specifically to serve as a front-end to (X)HTML. You can use span-level
16
> HTML tags anywhere in a Markdown document, and you can use block level
17
> HTML tags (like <div> and <table>) as well.
22
>>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
23
u'<p><em>boo!</em></p>\n'
25
>>> markdowner = Markdown()
26
>>> markdowner.convert("*boo!*")
27
u'<p><em>boo!</em></p>\n'
28
>>> markdowner.convert("**boom!**")
29
u'<p><strong>boom!</strong></p>\n'
31
This implementation of Markdown implements the full "core" syntax plus a
32
number of extras (e.g., code syntax coloring, footnotes) as described on
33
<http://code.google.com/p/python-markdown2/wiki/Extras>.
36
cmdln_desc = """A fast and complete Python implementation of Markdown, a
37
text-to-HTML conversion tool for web writers.
41
# - There is already a Python markdown processor
42
# (http://www.freewisdom.org/projects/python-markdown/).
43
# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
44
# not yet sure if there are implications with this. Compare 'pydoc sre'
45
# and 'perldoc perlre'.
47
__version_info__ = (1, 0, 1, 14) # first three nums match Markdown.pl
48
__version__ = '1.0.1.14'
49
__author__ = "Trent Mick"
53
from pprint import pprint
57
from hashlib import md5
61
from random import random
66
#---- Python version compat
68
if sys.version_info[:2] < (2,4):
69
from sets import Set as set
70
def reversed(sequence):
71
for i in sequence[::-1]:
73
def _unicode_decode(s, encoding, errors='xmlcharrefreplace'):
74
return unicode(s, encoding, errors)
76
def _unicode_decode(s, encoding, errors='strict'):
77
return s.decode(encoding, errors)
83
log = logging.getLogger("markdown")
87
# Table of hash values for escaped characters:
89
# Lame attempt to avoid possible collision with someone actually
90
# using the MD5 hexdigest of one of these chars in their text.
91
# Other ideas: random.random(), uuid.uuid()
92
#return md5(s).hexdigest() # Markdown.pl effectively does this.
93
return 'md5-'+md5(s).hexdigest()
94
g_escape_table = dict([(ch, _escape_hash(ch))
95
for ch in '\\`*_{}[]()>#+-.!'])
101
class MarkdownError(Exception):
108
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Convert a Markdown file to HTML and return the HTML text.

    @param path {str} path of the Markdown file to convert
    @param encoding {str} encoding of the file content (default "utf-8")
    The remaining options are passed through to the `Markdown` class;
    see its constructor for details.
    """
    # Explicitly close the file: the original one-liner
    # `codecs.open(...).read()` leaked the file handle until GC.
    fp = codecs.open(path, 'r', encoding)
    try:
        text = fp.read()
    finally:
        fp.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
118
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """Convert the given Markdown text to HTML and return it.

    All options are forwarded unchanged to the `Markdown` class;
    see its constructor for their meaning.
    """
    converter = Markdown(html4tags=html4tags,
                         tab_width=tab_width,
                         safe_mode=safe_mode,
                         extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
126
class Markdown(object):
127
# The dict of "extras" to enable in processing -- a mapping of
128
# extra name to argument for the extra. Most extras do not have an
129
# argument, in which case the value is None.
131
# This can be set via (a) subclassing and (b) the constructor
139
html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py
141
# Used to track when we're inside an ordered or unordered list
142
# (see _ProcessListItems() for details):
145
_ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
147
def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
148
extras=None, link_patterns=None, use_file_vars=False):
150
self.empty_element_suffix = ">"
152
self.empty_element_suffix = " />"
153
self.tab_width = tab_width
155
# For compatibility with earlier markdown2.py and with
156
# markdown.py's safe_mode being a boolean,
157
# safe_mode == True -> "replace"
158
if safe_mode is True:
159
self.safe_mode = "replace"
161
self.safe_mode = safe_mode
163
if self.extras is None:
165
elif not isinstance(self.extras, dict):
166
self.extras = dict([(e, None) for e in self.extras])
168
if not isinstance(extras, dict):
169
extras = dict([(e, None) for e in extras])
170
self.extras.update(extras)
171
assert isinstance(self.extras, dict)
172
self._instance_extras = self.extras.copy()
173
self.link_patterns = link_patterns
174
self.use_file_vars = use_file_vars
175
self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
180
self.html_blocks = {}
183
self.extras = self._instance_extras.copy()
184
if "footnotes" in self.extras:
186
self.footnote_ids = []
188
def convert(self, text):
189
"""Convert the given text."""
190
# Main function. The order in which other subs are called here is
191
# essential. Link and image substitutions need to happen before
192
# _EscapeSpecialChars(), so that any *'s or _'s in the <a>
193
# and <img> tags get encoded.
195
# Clear the global hashes. If we don't clear these, you get conflicts
196
# from other articles when generating a page which contains more than
197
# one article (e.g. an index page that shows the N most recent
201
if not isinstance(text, unicode):
202
#TODO: perhaps shouldn't presume UTF-8 for string input?
203
text = unicode(text, 'utf-8')
205
if self.use_file_vars:
206
# Look for emacs-style file variable hints.
207
emacs_vars = self._get_emacs_vars(text)
208
if "markdown-extras" in emacs_vars:
209
splitter = re.compile("[ ,]+")
210
for e in splitter.split(emacs_vars["markdown-extras"]):
212
ename, earg = e.split('=', 1)
218
ename, earg = e, None
219
self.extras[ename] = earg
221
# Standardize line endings:
222
text = re.sub("\r\n|\r", "\n", text)
224
# Make sure $text ends with a couple of newlines:
227
# Convert all tabs to spaces.
228
text = self._detab(text)
230
# Strip any lines consisting only of spaces and tabs.
231
# This makes subsequent regexen easier to write, because we can
232
# match consecutive blank lines with /\n+/ instead of something
233
# contorted like /[ \t]*\n+/ .
234
text = self._ws_only_line_re.sub("", text)
237
text = self._hash_html_spans(text)
239
# Turn block-level HTML blocks into hash entries
240
text = self._hash_html_blocks(text, raw=True)
242
# Strip link definitions, store in hashes.
243
if "footnotes" in self.extras:
244
# Must do footnotes first because an unlucky footnote defn
245
# looks like a link defn:
246
# [^4]: this "looks like a link defn"
247
text = self._strip_footnote_definitions(text)
248
text = self._strip_link_definitions(text)
250
text = self._run_block_gamut(text)
252
if "footnotes" in self.extras:
253
text = self._add_footnotes(text)
255
text = self._unescape_special_chars(text)
258
text = self._unhash_html_spans(text)
263
_emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
264
# This regular expression is intended to match blocks like this:
265
# PREFIX Local Variables: SUFFIX
266
# PREFIX mode: Tcl SUFFIX
269
# - "[ \t]" is used instead of "\s" to specifically exclude newlines
270
# - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
271
# not like anything other than Unix-style line terminators.
272
_emacs_local_vars_pat = re.compile(r"""^
273
(?P<prefix>(?:[^\r\n|\n|\r])*?)
274
[\ \t]*Local\ Variables:[\ \t]*
275
(?P<suffix>.*?)(?:\r\n|\n|\r)
276
(?P<content>.*?\1End:)
277
""", re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
279
def _get_emacs_vars(self, text):
280
"""Return a dictionary of emacs-style local variables.
282
Parsing is done loosely according to this spec (and according to
283
some in-practice deviations from this):
284
http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
287
SIZE = pow(2, 13) # 8kB
289
# Search near the start for a '-*-'-style one-liner of variables.
292
match = self._emacs_oneliner_vars_pat.search(head)
294
emacs_vars_str = match.group(1)
295
assert '\n' not in emacs_vars_str
296
emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
298
if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
299
# While not in the spec, this form is allowed by emacs:
301
# where the implied "variable" is "mode". This form
302
# is only allowed if there are no other variables.
303
emacs_vars["mode"] = emacs_var_strs[0].strip()
305
for emacs_var_str in emacs_var_strs:
307
variable, value = emacs_var_str.strip().split(':', 1)
309
log.debug("emacs variables error: malformed -*- "
310
"line: %r", emacs_var_str)
312
# Lowercase the variable name because Emacs allows "Mode"
313
# or "mode" or "MoDe", etc.
314
emacs_vars[variable.lower()] = value.strip()
317
if "Local Variables" in tail:
318
match = self._emacs_local_vars_pat.search(tail)
320
prefix = match.group("prefix")
321
suffix = match.group("suffix")
322
lines = match.group("content").splitlines(0)
323
#print "prefix=%r, suffix=%r, content=%r, lines: %s"\
324
# % (prefix, suffix, match.group("content"), lines)
326
# Validate the Local Variables block: proper prefix and suffix
328
for i, line in enumerate(lines):
329
if not line.startswith(prefix):
330
log.debug("emacs variables error: line '%s' "
331
"does not use proper prefix '%s'"
334
# Don't validate suffix on last line. Emacs doesn't care,
336
if i != len(lines)-1 and not line.endswith(suffix):
337
log.debug("emacs variables error: line '%s' "
338
"does not use proper suffix '%s'"
342
# Parse out one emacs var per line.
344
for line in lines[:-1]: # no var on the last line ("PREFIX End:")
345
if prefix: line = line[len(prefix):] # strip prefix
346
if suffix: line = line[:-len(suffix)] # strip suffix
349
variable = continued_for
350
if line.endswith('\\'):
351
line = line[:-1].rstrip()
354
emacs_vars[variable] += ' ' + line
357
variable, value = line.split(':', 1)
359
log.debug("local variables error: missing colon "
360
"in local variables entry: '%s'" % line)
362
# Do NOT lowercase the variable name, because Emacs only
363
# allows "mode" (and not "Mode", "MoDe", etc.) in this block.
364
value = value.strip()
365
if value.endswith('\\'):
366
value = value[:-1].rstrip()
367
continued_for = variable
370
emacs_vars[variable] = value
373
for var, val in emacs_vars.items():
374
if len(val) > 1 and (val.startswith('"') and val.endswith('"')
375
or val.startswith('"') and val.endswith('"')):
376
emacs_vars[var] = val[1:-1]
380
# Cribbed from a post by Bart Lateur:
381
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
382
_detab_re = re.compile(r'(.*?)\t', re.M)
383
def _detab_sub(self, match):
385
return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
386
def _detab(self, text):
387
r"""Remove (leading?) tabs from a file.
390
>>> m._detab("\tfoo")
392
>>> m._detab(" \tfoo")
394
>>> m._detab("\t foo")
398
>>> m._detab(" foo\n\tbar\tblam")
403
return self._detab_re.subn(self._detab_sub, text)[0]
405
_block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
406
_strict_tag_block_re = re.compile(r"""
408
^ # start of line (with re.M)
409
<(%s) # start tag = \2
411
(.*\n)*? # any number of lines, minimally matching
412
</\2> # the matching end tag
413
[ \t]* # trailing spaces/tabs
414
(?=\n+|\Z) # followed by a newline or end of document
419
_block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
420
_liberal_tag_block_re = re.compile(r"""
422
^ # start of line (with re.M)
423
<(%s) # start tag = \2
425
(.*\n)*? # any number of lines, minimally matching
426
.*</\2> # the matching end tag
427
[ \t]* # trailing spaces/tabs
428
(?=\n+|\Z) # followed by a newline or end of document
433
def _hash_html_block_sub(self, match, raw=False):
    """re.sub callback: stash a matched HTML block, return its hash key.

    The block is stored in `self.html_blocks` keyed by its hash so it
    can be restored after Markdown processing. When `raw` is true the
    block came from the original source, so in safe mode it is
    sanitized first.
    """
    block = match.group(1)
    if raw and self.safe_mode:
        block = self._sanitize_html(block)
    digest = _hash_text(block)
    self.html_blocks[digest] = block
    return "\n\n%s\n\n" % digest
441
def _hash_html_blocks(self, text, raw=False):
442
"""Hashify HTML blocks
444
We only want to do this for block-level HTML tags, such as headers,
445
lists, and tables. That's because we still want to wrap <p>s around
446
"paragraphs" that are wrapped in non-block-level tags, such as anchors,
447
phrase emphasis, and spans. The list of tags we're looking for is
450
@param raw {boolean} indicates if these are raw HTML blocks in
451
the original source. It makes a difference in "safe" mode.
456
# Pass `raw` value into our calls to self._hash_html_block_sub.
457
hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)
459
# First, look for nested blocks, e.g.:
462
# tags for inner block must be indented.
466
# The outermost tags must start at the left margin for this to match, and
467
# the inner nested divs must be indented.
468
# We need to do this before the next, more liberal match, because the next
469
# match will start at the first `<div>` and stop at the first `</div>`.
470
text = self._strict_tag_block_re.sub(hash_html_block_sub, text)
472
# Now match more liberally, simply from `\n<tag>` to `</tag>\n`
473
text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)
475
# Special case just for <hr />. It was easier to make a special
476
# case than to make the other regex more complicated.
478
_hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
479
text = _hr_tag_re.sub(hash_html_block_sub, text)
481
# Special case for standalone HTML comments:
485
# Delimiters for next comment block.
487
start_idx = text.index("<!--", start)
488
except ValueError, ex:
491
end_idx = text.index("-->", start_idx) + 3
492
except ValueError, ex:
495
# Start position for next comment block search.
498
# Validate whitespace before comment.
500
# - Up to `tab_width - 1` spaces before start_idx.
501
for i in range(self.tab_width - 1):
502
if text[start_idx - 1] != ' ':
507
# - Must be preceded by 2 newlines or hit the start of
511
elif start_idx == 1 and text[0] == '\n':
512
start_idx = 0 # to match minute detail of Markdown.pl regex
513
elif text[start_idx-2:start_idx] == '\n\n':
518
# Validate whitespace after comment.
519
# - Any number of spaces and tabs.
520
while end_idx < len(text):
521
if text[end_idx] not in ' \t':
524
# - Must be followed by 2 newlines or hit end of text.
525
if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
528
# Escape and hash (must match `_hash_html_block_sub`).
529
html = text[start_idx:end_idx]
530
if raw and self.safe_mode:
531
html = self._sanitize_html(html)
532
key = _hash_text(html)
533
self.html_blocks[key] = html
534
text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]
536
if "xml" in self.extras:
537
# Treat XML processing instructions and namespaced one-liner
538
# tags as if they were block HTML tags. E.g., if standalone
539
# (i.e. are their own paragraph), the following do not get
540
# wrapped in a <p> tag:
543
# <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
544
_xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
545
text = _xml_oneliner_re.sub(hash_html_block_sub, text)
549
def _strip_link_definitions(self, text):
550
# Strips link definitions from text, stores the URLs and titles in
552
less_than_tab = self.tab_width - 1
554
# Link defs are in the form:
555
# [id]: url "optional title"
556
_link_def_re = re.compile(r"""
557
^[ ]{0,%d}\[(.+)\]: # id = \1
559
\n? # maybe *one* newline
564
\n? # maybe one newline
566
(?<=\s) # lookbehind for whitespace
568
([^\n]*) # title = \3
571
)? # title is optional
573
""" % less_than_tab, re.X | re.M | re.U)
574
return _link_def_re.sub(self._extract_link_def_sub, text)
576
def _extract_link_def_sub(self, match):
577
id, url, title = match.groups()
578
key = id.lower() # Link IDs are case-insensitive
579
self.urls[key] = self._encode_amps_and_angles(url)
581
self.titles[key] = title.replace('"', '"')
584
def _extract_footnote_def_sub(self, match):
585
id, text = match.groups()
586
text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
587
normed_id = re.sub(r'\W', '-', id)
588
# Ensure footnote text ends with a couple newlines (for some
589
# block gamut matches).
590
self.footnotes[normed_id] = text + "\n\n"
593
def _strip_footnote_definitions(self, text):
594
"""A footnote definition looks like this:
596
[^note-id]: Text of the note.
598
May include one or more indented paragraphs.
601
- The 'note-id' can be pretty much anything, though typically it
602
is the number of the footnote.
603
- The first paragraph may start on the next line, like so:
608
less_than_tab = self.tab_width - 1
609
footnote_def_re = re.compile(r'''
610
^[ ]{0,%d}\[\^(.+)\]: # id = \1
612
( # footnote text = \2
613
# First line need not start with the spaces.
616
(?:[ ]{%d} | \t) # Subsequent lines must be indented.
620
# Lookahead for non-space at line-start, or end of doc.
621
(?:(?=^[ ]{0,%d}\S)|\Z)
622
''' % (less_than_tab, self.tab_width, self.tab_width),
624
return footnote_def_re.sub(self._extract_footnote_def_sub, text)
628
re.compile(r"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", re.M),
629
re.compile(r"^[ ]{0,2}([ ]?\-[ ]?){3,}[ \t]*$", re.M),
630
re.compile(r"^[ ]{0,2}([ ]?\_[ ]?){3,}[ \t]*$", re.M),
633
def _run_block_gamut(self, text):
634
# These are all the transformations that form block-level
635
# tags like paragraphs, headers, and list items.
637
text = self._do_headers(text)
639
# Do Horizontal Rules:
640
hr = "\n<hr"+self.empty_element_suffix+"\n"
641
for hr_re in self._hr_res:
642
text = hr_re.sub(hr, text)
644
text = self._do_lists(text)
646
if "pyshell" in self.extras:
647
text = self._prepare_pyshell_blocks(text)
649
text = self._do_code_blocks(text)
651
text = self._do_block_quotes(text)
653
# We already ran _HashHTMLBlocks() before, in Markdown(), but that
654
# was to escape raw HTML in the original Markdown source. This time,
655
# we're escaping the markup we've just created, so that we don't wrap
656
# <p> tags around block-level tags.
657
text = self._hash_html_blocks(text)
659
text = self._form_paragraphs(text)
663
def _pyshell_block_sub(self, match):
664
lines = match.group(0).splitlines(0)
666
indent = ' ' * self.tab_width
667
s = ('\n' # separate from possible cuddled paragraph
668
+ indent + ('\n'+indent).join(lines)
672
def _prepare_pyshell_blocks(self, text):
673
"""Ensure that Python interactive shell sessions are put in
674
code blocks -- even if not properly indented.
676
if ">>>" not in text:
679
less_than_tab = self.tab_width - 1
680
_pyshell_block_re = re.compile(r"""
681
^([ ]{0,%d})>>>[ ].*\n # first line
682
^(\1.*\S+.*\n)* # any number of subsequent lines
683
^\n # ends with a blank line
684
""" % less_than_tab, re.M | re.X)
686
return _pyshell_block_re.sub(self._pyshell_block_sub, text)
688
def _run_span_gamut(self, text):
689
# These are all the transformations that occur *within* block-level
690
# tags like paragraphs, headers, and list items.
692
text = self._do_code_spans(text)
694
text = self._escape_special_chars(text)
696
# Process anchor and image tags.
697
text = self._do_links(text)
699
# Make links out of things like `<http://example.com/>`
700
# Must come after _do_links(), because you can use < and >
701
# delimiters in inline links like [this](<url>).
702
text = self._do_auto_links(text)
704
if "link-patterns" in self.extras:
705
text = self._do_link_patterns(text)
707
text = self._encode_amps_and_angles(text)
709
text = self._do_italics_and_bold(text)
712
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
716
# "Sorta" because auto-links are identified as "tag" tokens.
717
_sorta_html_tokenize_re = re.compile(r"""
722
(?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes
725
# auto-link (e.g., <http://www.activestate.com/>)
730
<\?.*?\?> # processing instruction
734
def _escape_special_chars(self, text):
735
# Python markdown note: the HTML tokenization here differs from
736
# that in Markdown.pl, hence the behaviour for subtle cases can
737
# differ (I believe the tokenizer here does a better job because
738
# it isn't susceptible to unmatched '<' and '>' in HTML tags).
739
# Note, however, that '>' is not allowed in an auto-link URL
742
is_html_markup = False
743
for token in self._sorta_html_tokenize_re.split(text):
745
# Within tags/HTML-comments/auto-links, encode * and _
746
# so they don't conflict with their use in Markdown for
747
# italics and strong. We're replacing each such
748
# character with its corresponding MD5 checksum value;
749
# this is likely overkill, but it should prevent us from
750
# colliding with the escape values by accident.
751
escaped.append(token.replace('*', g_escape_table['*'])
752
.replace('_', g_escape_table['_']))
754
escaped.append(self._encode_backslash_escapes(token))
755
is_html_markup = not is_html_markup
756
return ''.join(escaped)
758
def _hash_html_spans(self, text):
759
# Used for safe_mode.
761
def _is_auto_link(s):
762
if ':' in s and self._auto_link_re.match(s):
764
elif '@' in s and self._auto_email_link_re.match(s):
769
is_html_markup = False
770
for token in self._sorta_html_tokenize_re.split(text):
771
if is_html_markup and not _is_auto_link(token):
772
sanitized = self._sanitize_html(token)
773
key = _hash_text(sanitized)
774
self.html_spans[key] = sanitized
778
is_html_markup = not is_html_markup
779
return ''.join(tokens)
781
def _unhash_html_spans(self, text):
782
for key, sanitized in self.html_spans.items():
783
text = text.replace(key, sanitized)
786
def _sanitize_html(self, s):
787
if self.safe_mode == "replace":
788
return self.html_removed_text
789
elif self.safe_mode == "escape":
795
for before, after in replacements:
796
s = s.replace(before, after)
799
raise MarkdownError("invalid value for 'safe_mode': %r (must be "
800
"'escape' or 'replace')" % self.safe_mode)
802
_tail_of_inline_link_re = re.compile(r'''
803
# Match tail of: [text](/url/) or [text](/url/ "title")
813
(['"]) # quote char = \3
816
)? # title is optional
819
_tail_of_reference_link_re = re.compile(r'''
820
# Match tail of: [text][id]
821
[ ]? # one optional space
822
(?:\n[ ]*)? # one optional newline followed by spaces
828
def _do_links(self, text):
829
"""Turn Markdown link shortcuts into XHTML <a> and <img> tags.
831
This is a combination of Markdown.pl's _DoAnchors() and
832
_DoImages(). They are done together because that simplified the
833
approach. It was necessary to use a different approach than
834
Markdown.pl because of the lack of atomic matching support in
835
Python's regex engine used in $g_nested_brackets.
837
MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
839
# `anchor_allowed_pos` is used to support img links inside
840
# anchors, but not anchors inside anchors. An anchor's start
841
# pos must be `>= anchor_allowed_pos`.
842
anchor_allowed_pos = 0
845
while True: # Handle the next link.
846
# The next '[' is the start of:
847
# - an inline anchor: [text](url "title")
848
# - a reference anchor: [text][id]
849
# - an inline img: 
850
# - a reference img: ![text][id]
851
# - a footnote ref: [^id]
852
# (Only if 'footnotes' extra enabled)
853
# - a footnote defn: [^id]: ...
854
# (Only if 'footnotes' extra enabled) These have already
855
# been stripped in _strip_footnote_definitions() so no
856
# need to watch for them.
857
# - a link definition: [id]: url "title"
858
# These have already been stripped in
859
# _strip_link_definitions() so no need to watch for them.
860
# - not markup: [...anything else...
862
start_idx = text.index('[', curr_pos)
865
text_length = len(text)
867
# Find the matching closing ']'.
868
# Markdown.pl allows *matching* brackets in link text so we
869
# will here too. Markdown.pl *doesn't* currently allow
870
# matching brackets in img alt text -- we'll differ in that
873
for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
878
if bracket_depth < 0:
883
# Closing bracket not found within sentinel length.
885
curr_pos = start_idx + 1
887
link_text = text[start_idx+1:p]
889
# Possibly a footnote ref?
890
if "footnotes" in self.extras and link_text.startswith("^"):
891
normed_id = re.sub(r'\W', '-', link_text[1:])
892
if normed_id in self.footnotes:
893
self.footnote_ids.append(normed_id)
894
result = '<sup class="footnote-ref" id="fnref-%s">' \
895
'<a href="#fn-%s">%s</a></sup>' \
896
% (normed_id, normed_id, len(self.footnote_ids))
897
text = text[:start_idx] + result + text[p+1:]
899
# This id isn't defined, leave the markup alone.
903
# Now determine what this is by the remainder.
908
# Inline anchor or img?
909
if text[p] == '(': # attempt at perf improvement
910
match = self._tail_of_inline_link_re.match(text, p)
912
# Handle an inline anchor or img.
913
is_img = start_idx > 0 and text[start_idx-1] == "!"
917
url, title = match.group("url"), match.group("title")
918
if url and url[0] == '<':
919
url = url[1:-1] # '<url>' -> 'url'
920
# We've got to encode these to avoid conflicting
922
url = url.replace('*', g_escape_table['*']) \
923
.replace('_', g_escape_table['_'])
925
title_str = ' title="%s"' \
926
% title.replace('*', g_escape_table['*']) \
927
.replace('_', g_escape_table['_']) \
928
.replace('"', '"')
932
result = '<img src="%s" alt="%s"%s%s' \
933
% (url, link_text.replace('"', '"'),
934
title_str, self.empty_element_suffix)
935
curr_pos = start_idx + len(result)
936
text = text[:start_idx] + result + text[match.end():]
937
elif start_idx >= anchor_allowed_pos:
938
result_head = '<a href="%s"%s>' % (url, title_str)
939
result = '%s%s</a>' % (result_head, link_text)
940
# <img> allowed from curr_pos on, <a> from
941
# anchor_allowed_pos on.
942
curr_pos = start_idx + len(result_head)
943
anchor_allowed_pos = start_idx + len(result)
944
text = text[:start_idx] + result + text[match.end():]
946
# Anchor not allowed here.
947
curr_pos = start_idx + 1
950
# Reference anchor or img?
952
match = self._tail_of_reference_link_re.match(text, p)
954
# Handle a reference-style anchor or img.
955
is_img = start_idx > 0 and text[start_idx-1] == "!"
958
link_id = match.group("id").lower()
960
link_id = link_text.lower() # for links like [this][]
961
if link_id in self.urls:
962
url = self.urls[link_id]
963
# We've got to encode these to avoid conflicting
965
url = url.replace('*', g_escape_table['*']) \
966
.replace('_', g_escape_table['_'])
967
title = self.titles.get(link_id)
969
title = title.replace('*', g_escape_table['*']) \
970
.replace('_', g_escape_table['_'])
971
title_str = ' title="%s"' % title
975
result = '<img src="%s" alt="%s"%s%s' \
976
% (url, link_text.replace('"', '"'),
977
title_str, self.empty_element_suffix)
978
curr_pos = start_idx + len(result)
979
text = text[:start_idx] + result + text[match.end():]
980
elif start_idx >= anchor_allowed_pos:
981
result = '<a href="%s"%s>%s</a>' \
982
% (url, title_str, link_text)
983
result_head = '<a href="%s"%s>' % (url, title_str)
984
result = '%s%s</a>' % (result_head, link_text)
985
# <img> allowed from curr_pos on, <a> from
986
# anchor_allowed_pos on.
987
curr_pos = start_idx + len(result_head)
988
anchor_allowed_pos = start_idx + len(result)
989
text = text[:start_idx] + result + text[match.end():]
991
# Anchor not allowed here.
992
curr_pos = start_idx + 1
994
# This id isn't defined, leave the markup alone.
995
curr_pos = match.end()
998
# Otherwise, it isn't markup.
999
curr_pos = start_idx + 1
1004
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
1005
def _setext_h_sub(self, match):
1006
n = {"=": 1, "-": 2}[match.group(2)[0]]
1007
demote_headers = self.extras.get("demote-headers")
1009
n = min(n + demote_headers, 6)
1010
return "<h%d>%s</h%d>\n\n" \
1011
% (n, self._run_span_gamut(match.group(1)), n)
1013
_atx_h_re = re.compile(r'''
1014
^(\#{1,6}) # \1 = string of #'s
1016
(.+?) # \2 = Header text
1018
(?<!\\) # ensure not an escaped trailing '#'
1019
\#* # optional closing #'s (not counted)
1022
def _atx_h_sub(self, match):
1023
n = len(match.group(1))
1024
demote_headers = self.extras.get("demote-headers")
1026
n = min(n + demote_headers, 6)
1027
return "<h%d>%s</h%d>\n\n" \
1028
% (n, self._run_span_gamut(match.group(2)), n)
1030
def _do_headers(self, text):
1031
# Setext-style headers:
1037
text = self._setext_h_re.sub(self._setext_h_sub, text)
1039
# atx-style headers:
1042
# ## Header 2 with closing hashes ##
1045
text = self._atx_h_re.sub(self._atx_h_sub, text)
1050
_marker_ul_chars = '*+-'
1051
_marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
1052
_marker_ul = '(?:[%s])' % _marker_ul_chars
1053
_marker_ol = r'(?:\d+\.)'
1055
def _list_sub(self, match):
1056
lst = match.group(1)
1057
lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1058
result = self._process_list_items(lst)
1060
return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1062
return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
1064
def _do_lists(self, text):
1065
# Form HTML ordered (numbered) and unordered (bulleted) lists.
1067
for marker_pat in (self._marker_ul, self._marker_ol):
1068
# Re-usable pattern to match any entire ul or ol list:
1069
less_than_tab = self.tab_width - 1
1074
(%s) # \3 = first list item marker
1083
(?! # Negative lookahead for another list item marker
1089
''' % (less_than_tab, marker_pat, marker_pat)
1091
# We use a different prefix before nested lists than top-level lists.
1092
# See extended comment in _process_list_items().
1094
# Note: There's a bit of duplication here. My original implementation
1095
# created a scalar regex pattern as the conditional result of the test on
1096
# $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1097
# substitution once, using the scalar as the pattern. This worked,
1098
# everywhere except when running under MT on my hosting account at Pair
1099
# Networks. There, this caused all rebuilds to be killed by the reaper (or
1100
# perhaps they crashed, but that seems incredibly unlikely given that the
1101
# same script on the same server ran fine *except* under MT. I've spent
1102
# more time trying to figure out why this is happening than I'd like to
1103
# admit. My only guess, backed up by the fact that this workaround works,
1104
# is that Perl optimizes the substitution when it can figure out that the
1105
# pattern will never change, and when this optimization isn't on, we run
1106
# afoul of the reaper. Thus, the slightly redundant code to that uses two
1107
# static s/// patterns rather than one conditional pattern.
1110
sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
1111
text = sub_list_re.sub(self._list_sub, text)
1113
list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
1115
text = list_re.sub(self._list_sub, text)
1119
_list_item_re = re.compile(r'''
1120
(\n)? # leading line = \1
1121
(^[ \t]*) # leading whitespace = \2
1122
(%s) [ \t]+ # list marker = \3
1123
((?:.+?) # list item text = \4
1124
(\n{1,2})) # eols = \5
1125
(?= \n* (\Z | \2 (%s) [ \t]+))
1126
''' % (_marker_any, _marker_any),
1129
_last_li_endswith_two_eols = False
1130
def _list_item_sub(self, match):
1131
item = match.group(4)
1132
leading_line = match.group(1)
1133
leading_space = match.group(2)
1134
if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1135
item = self._run_block_gamut(self._outdent(item))
1137
# Recursion for sub-lists:
1138
item = self._do_lists(self._outdent(item))
1139
if item.endswith('\n'):
1141
item = self._run_span_gamut(item)
1142
self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1143
return "<li>%s</li>\n" % item
1145
def _process_list_items(self, list_str):
1146
# Process the contents of a single ordered or unordered list,
1147
# splitting it into individual list items.
1149
# The $g_list_level global keeps track of when we're inside a list.
1150
# Each time we enter a list, we increment it; when we leave a list,
1151
# we decrement. If it's zero, we're not in a list anymore.
1153
# We do this because when we're not inside a list, we want to treat
1154
# something like this:
1156
# I recommend upgrading to version
1157
# 8. Oops, now this line is treated
1160
# As a single paragraph, despite the fact that the second line starts
1161
# with a digit-period-space sequence.
1163
# Whereas when we're inside a list (or sub-list), that line will be
1164
# treated as the start of a sub-list. What a kludge, huh? This is
1165
# an aspect of Markdown's syntax that's hard to parse perfectly
1166
# without resorting to mind-reading. Perhaps the solution is to
1167
# change the syntax rules such that sub-lists must start with a
1168
# starting cardinal number; e.g. "1." or "a.".
1169
self.list_level += 1
1170
self._last_li_endswith_two_eols = False
1171
list_str = list_str.rstrip('\n') + '\n'
1172
list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1173
self.list_level -= 1
1176
def _get_pygments_lexer(self, lexer_name):
1178
from pygments import lexers, util
1182
return lexers.get_lexer_by_name(lexer_name)
1183
except util.ClassNotFound:
1186
def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
1188
import pygments.formatters
1190
class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1191
def _wrap_code(self, inner):
1192
"""A function for use in a Pygments Formatter which
1193
wraps in <code> tags.
1200
def wrap(self, source, outfile):
1201
"""Return the source with a code, pre, and div."""
1202
return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1204
formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts)
1205
return pygments.highlight(codeblock, lexer, formatter)
1207
def _code_block_sub(self, match):
1208
codeblock = match.group(1)
1209
codeblock = self._outdent(codeblock)
1210
codeblock = self._detab(codeblock)
1211
codeblock = codeblock.lstrip('\n') # trim leading newlines
1212
codeblock = codeblock.rstrip() # trim trailing whitespace
1214
if "code-color" in self.extras and codeblock.startswith(":::"):
1215
lexer_name, rest = codeblock.split('\n', 1)
1216
lexer_name = lexer_name[3:].strip()
1217
lexer = self._get_pygments_lexer(lexer_name)
1218
codeblock = rest.lstrip("\n") # Remove lexer declaration line.
1220
formatter_opts = self.extras['code-color'] or {}
1221
colored = self._color_with_pygments(codeblock, lexer,
1223
return "\n\n%s\n\n" % colored
1225
codeblock = self._encode_code(codeblock)
1226
return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock
1228
def _do_code_blocks(self, text):
1229
"""Process Markdown `<pre><code>` blocks."""
1230
code_block_re = re.compile(r'''
1232
( # $1 = the code block -- one or more lines, starting with a space/tab
1234
(?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
1238
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1239
''' % (self.tab_width, self.tab_width),
1242
return code_block_re.sub(self._code_block_sub, text)
1245
# Rules for a code span:
1246
# - backslash escapes are not interpreted in a code span
1247
# - to include one or or a run of more backticks the delimiters must
1248
# be a longer run of backticks
1249
# - cannot start or end a code span with a backtick; pad with a
1250
# space and that space will be removed in the emitted HTML
1251
# See `test/tm-cases/escapes.text` for a number of edge-case
1253
_code_span_re = re.compile(r'''
1255
(`+) # \1 = Opening run of `
1256
(?!`) # See Note A test/tm-cases/escapes.text
1257
(.+?) # \2 = The code block
1259
\1 # Matching closer
1263
def _code_span_sub(self, match):
1264
c = match.group(2).strip(" \t")
1265
c = self._encode_code(c)
1266
return "<code>%s</code>" % c
1268
def _do_code_spans(self, text):
1269
# * Backtick quotes are used for <code></code> spans.
1271
# * You can use multiple backticks as the delimiters if you want to
1272
# include literal backticks in the code span. So, this input:
1274
# Just type ``foo `bar` baz`` at the prompt.
1276
# Will translate to:
1278
# <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1280
# There's no arbitrary limit to the number of backticks you
1281
# can use as delimters. If you need three consecutive backticks
1282
# in your code, use four for delimiters, etc.
1284
# * You can use spaces to get literal backticks at the edges:
1286
# ... type `` `bar` `` ...
1290
# ... type <code>`bar`</code> ...
1291
return self._code_span_re.sub(self._code_span_sub, text)
1293
def _encode_code(self, text):
1294
"""Encode/escape certain characters inside Markdown code runs.
1295
The point is that in code, these characters are literals,
1296
and lose their special Markdown meanings.
1299
# Encode all ampersands; HTML entities are not
1300
# entities within a Markdown code span.
1302
# Do the angle bracket song and dance:
1305
# Now, escape characters that are magic in Markdown:
1306
('*', g_escape_table['*']),
1307
('_', g_escape_table['_']),
1308
('{', g_escape_table['{']),
1309
('}', g_escape_table['}']),
1310
('[', g_escape_table['[']),
1311
(']', g_escape_table[']']),
1312
('\\', g_escape_table['\\']),
1314
for before, after in replacements:
1315
text = text.replace(before, after)
1318
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1319
_em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1320
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1321
_code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1322
def _do_italics_and_bold(self, text):
1323
# <strong> must go first:
1324
if "code-friendly" in self.extras:
1325
text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
1326
text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
1328
text = self._strong_re.sub(r"<strong>\2</strong>", text)
1329
text = self._em_re.sub(r"<em>\2</em>", text)
1333
_block_quote_re = re.compile(r'''
1334
( # Wrap whole match in \1
1336
^[ \t]*>[ \t]? # '>' at the start of a line
1337
.+\n # rest of the first line
1338
(.+\n)* # subsequent consecutive lines
1343
_bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1345
_html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1346
def _dedent_two_spaces_sub(self, match):
1347
return re.sub(r'(?m)^ ', '', match.group(1))
1349
def _block_quote_sub(self, match):
1351
bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
1352
bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
1353
bq = self._run_block_gamut(bq) # recurse
1355
bq = re.sub('(?m)^', ' ', bq)
1356
# These leading spaces screw with <pre> content, so we need to fix that:
1357
bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1359
return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1361
def _do_block_quotes(self, text):
1364
return self._block_quote_re.sub(self._block_quote_sub, text)
1366
def _form_paragraphs(self, text):
1367
# Strip leading and trailing lines:
1368
text = text.strip('\n')
1371
grafs = re.split(r"\n{2,}", text)
1372
for i, graf in enumerate(grafs):
1373
if graf in self.html_blocks:
1374
# Unhashify HTML blocks
1375
grafs[i] = self.html_blocks[graf]
1378
graf = self._run_span_gamut(graf)
1379
grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>"
1381
return "\n\n".join(grafs)
1383
def _add_footnotes(self, text):
1386
'<div class="footnotes">',
1387
'<hr' + self.empty_element_suffix,
1390
for i, id in enumerate(self.footnote_ids):
1393
footer.append('<li id="fn-%s">' % id)
1394
footer.append(self._run_block_gamut(self.footnotes[id]))
1395
backlink = ('<a href="#fnref-%s" '
1396
'class="footnoteBackLink" '
1397
'title="Jump back to footnote %d in the text.">'
1398
'↩</a>' % (id, i+1))
1399
if footer[-1].endswith("</p>"):
1400
footer[-1] = footer[-1][:-len("</p>")] \
1401
+ ' ' + backlink + "</p>"
1403
footer.append("\n<p>%s</p>" % backlink)
1404
footer.append('</li>')
1405
footer.append('</ol>')
1406
footer.append('</div>')
1407
return text + '\n\n' + '\n'.join(footer)
1411
# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1412
# http://bumppo.net/projects/amputator/
1413
_ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1414
_naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1415
_naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I)
1417
def _encode_amps_and_angles(self, text):
1418
# Smart processing for ampersands and angle brackets that need
1420
text = self._ampersand_re.sub('&', text)
1423
text = self._naked_lt_re.sub('<', text)
1426
# Note: Other markdown implementations (e.g. Markdown.pl, PHP
1427
# Markdown) don't do this.
1428
text = self._naked_gt_re.sub('>', text)
1431
def _encode_backslash_escapes(self, text):
1432
for ch, escape in g_escape_table.items():
1433
text = text.replace("\\"+ch, escape)
1436
_auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1437
def _auto_link_sub(self, match):
1439
return '<a href="%s">%s</a>' % (g1, g1)
1441
_auto_email_link_re = re.compile(r"""
1447
[-\w]+(\.[-\w]+)*\.[a-z]+
1450
""", re.I | re.X | re.U)
1451
def _auto_email_link_sub(self, match):
1452
return self._encode_email_address(
1453
self._unescape_special_chars(match.group(1)))
1455
def _do_auto_links(self, text):
1456
text = self._auto_link_re.sub(self._auto_link_sub, text)
1457
text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1460
def _encode_email_address(self, addr):
1461
# Input: an email address, e.g. "foo@example.com"
1463
# Output: the email address as a mailto link, with each character
1464
# of the address encoded as either a decimal or hex entity, in
1465
# the hopes of foiling most address harvesting spam bots. E.g.:
1467
# <a href="mailto:foo@e
1468
# xample.com">foo
1469
# @example.com</a>
1471
# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1472
# mailing list: <http://tinyurl.com/yu7ue>
1473
chars = [_xml_encode_email_char_at_random(ch)
1474
for ch in "mailto:" + addr]
1475
# Strip the mailto: from the visible part.
1476
addr = '<a href="%s">%s</a>' \
1477
% (''.join(chars), ''.join(chars[7:]))
1480
def _do_link_patterns(self, text):
1481
"""Caveat emptor: there isn't much guarding against link
1482
patterns being formed inside other standard Markdown links, e.g.
1483
inside a [link def][like this].
1485
Dev Notes: *Could* consider prefixing regexes with a negative
1486
lookbehind assertion to attempt to guard against this.
1489
for regex, repl in self.link_patterns:
1491
for match in regex.finditer(text):
1492
if hasattr(repl, "__call__"):
1495
href = match.expand(repl)
1496
replacements.append((match.span(), href))
1497
for (start, end), href in reversed(replacements):
1499
href.replace('"', '"') # b/c of attr quote
1500
# To avoid markdown <em> and <strong>:
1501
.replace('*', g_escape_table['*'])
1502
.replace('_', g_escape_table['_']))
1503
link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
1504
hash = md5(link).hexdigest()
1505
link_from_hash[hash] = link
1506
text = text[:start] + hash + text[end:]
1507
for hash, link in link_from_hash.items():
1508
text = text.replace(hash, link)
1511
def _unescape_special_chars(self, text):
1512
# Swap back in all the special characters we've hidden.
1513
for ch, hash in g_escape_table.items():
1514
text = text.replace(hash, ch)
1517
def _outdent(self, text):
1518
# Remove one level of line-leading tabs or spaces
1519
return self._outdent_re.sub('', text)
1522
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    extras = ["footnotes", "code-color"]
1537
#---- internal support functions
1539
# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1540
def _curry(*args, **kwargs):
1541
function, args = args[0], args[1:]
1542
def result(*rest, **kwrest):
1543
combined = kwargs.copy()
1544
combined.update(kwrest)
1545
return function(*args + rest, **combined)
1548
# Recipe: regex_from_encoded_pattern (1.0)
1549
def _regex_from_encoded_pattern(s):
1550
"""'foo' -> re.compile(re.escape('foo'))
1551
'/foo/' -> re.compile('foo')
1552
'/foo/i' -> re.compile('foo', re.I)
1554
if s.startswith('/') and s.rfind('/') != 0:
1555
# Parse it: /PATTERN/FLAGS
1557
pattern, flags_str = s[1:idx], s[idx+1:]
1566
for char in flags_str:
1568
flags |= flag_from_char[char]
1570
raise ValueError("unsupported regex flag: '%s' in '%s' "
1571
"(must be one of '%s')"
1572
% (char, s, ''.join(flag_from_char.keys())))
1573
return re.compile(s[1:idx], flags)
1574
else: # not an encoded regex
1575
return re.compile(re.escape(s))
1577
# Recipe: dedent (0.1.2)
1578
def _dedentlines(lines, tabsize=8, skip_first_line=False):
1579
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
1581
"lines" is a list of lines to dedent.
1582
"tabsize" is the tab width to use for indent width calculations.
1583
"skip_first_line" is a boolean indicating if the first line should
1584
be skipped for calculating the indent width and for dedenting.
1585
This is sometimes useful for docstrings and similar.
1587
Same as dedent() except operates on a sequence of lines. Note: the
1588
lines list is modified **in-place**.
1592
print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
1593
% (tabsize, skip_first_line)
1596
for i, line in enumerate(lines):
1597
if i == 0 and skip_first_line: continue
1603
indent += tabsize - (indent % tabsize)
1605
continue # skip all-whitespace lines
1609
continue # skip all-whitespace lines
1610
if DEBUG: print "dedent: indent=%d: %r" % (indent, line)
1614
margin = min(margin, indent)
1615
if DEBUG: print "dedent: margin=%r" % margin
1617
if margin is not None and margin > 0:
1618
for i, line in enumerate(lines):
1619
if i == 0 and skip_first_line: continue
1621
for j, ch in enumerate(line):
1625
removed += tabsize - (removed % tabsize)
1627
if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line
1628
lines[i] = lines[i][j:]
1631
raise ValueError("unexpected non-whitespace char %r in "
1632
"line %r while removing %d-space margin"
1633
% (ch, line, margin))
1635
print "dedent: %r: %r -> removed %d/%d"\
1636
% (line, ch, removed, margin)
1637
if removed == margin:
1638
lines[i] = lines[i][j+1:]
1640
elif removed > margin:
1641
lines[i] = ' '*(removed-margin) + lines[i][j+1:]
1645
lines[i] = lines[i][removed:]
1648
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    lines = text.splitlines(1)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
1664
class _memoized(object):
1665
"""Decorator that caches a function's return value each time it is called.
1666
If called later with the same arguments, the cached value is returned, and
1669
http://wiki.python.org/moin/PythonDecoratorLibrary
1671
def __init__(self, func):
1674
def __call__(self, *args):
1676
return self.cache[args]
1678
self.cache[args] = value = self.func(*args)
1681
# uncachable -- for instance, passing a list as an argument.
1682
# Better to not cache than to blow up entirely.
1683
return self.func(*args)
1685
"""Return the function's docstring."""
1686
return self.func.__doc__
1689
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
1710
def _hr_tag_re_from_tab_width(tab_width):
    """Standalone <hr/> tag regex (indented less than one tab stop)."""
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           # attributes, if any
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
1730
def _xml_encode_email_char_at_random(ch):
1732
# Roughly 10% raw, 45% hex, 45% dec.
1733
# '@' *must* be encoded. I [John Gruber] insist.
1734
# Issue 26: '_' must be encoded.
1735
if r > 0.9 and ch not in "@_":
1738
# The [1:] is to drop leading '0': 0x63 -> x63
1739
return '&#%s;' % hex(ord(ch))[1:]
1741
return '&#%s;' % ord(ch)
1743
def _hash_text(text):
1744
return 'md5:'+md5(text.encode("utf-8")).hexdigest()
1749
class _NoReflowFormatter(optparse.IndentedHelpFormatter):
1750
"""An optparse formatter that does NOT reflow the description."""
1751
def format_description(self, description):
1752
return description or ""
1758
def main(argv=None):
    """Command-line entry point: convert each PATH to HTML on stdout.

    NOTE(review): reconstructed from a line-number-mangled extraction;
    dropped lines were restored from the surrounding option definitions —
    confirm against the upstream markdown2.py of this version.
    """
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). Supported values: "
                           "'code-friendly' disables _/__ for emphasis; "
                           "'code-color' adds code-block syntax coloring; "
                           "'link-patterns' adds auto-linking based on patterns; "
                           "'footnotes' adds the footnotes syntax;"
                           "'xml' passes one-liner processing instructions and namespaced XML tags;"
                           "'pyshell' to put unindented Python interactive shell sessions in a <code> block.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<http://code.google.com/p/python-markdown2/wiki/Extras>.")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    # Parse "-x name[=arg]" occurrences into an extras dict.
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    # Load "PATTERN HREF" lines from the link-patterns file, if given.
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    for path in paths:
        if opts.compare:
            print("==== Markdown.pl ====")
            perl_cmd = 'perl %s "%s"' % (markdown_pl, path)
            o = os.popen(perl_cmd)
            perl_html = o.read()
            o.close()
            sys.stdout.write(perl_html)
            print("==== markdown2.py ====")
        html = markdown_path(path, encoding=opts.encoding,
                             html4tags=opts.html4tags,
                             safe_mode=opts.safe_mode,
                             extras=extras, link_patterns=link_patterns,
                             use_file_vars=opts.use_file_vars)
        sys.stdout.write(
            html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if opts.compare:
            # Normalize both outputs (when the test helper is available)
            # before comparing.
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))
1875
if __name__ == "__main__":
    sys.exit( main(sys.argv) )