5
Python Markdown converts Markdown to HTML and can be used as a library or
6
called from the command line.
8
## Basic usage as a module:
12
html = md.convert(your_text_string)
14
## Basic use from the command line:
16
python markdown.py source.txt > destination.html
18
Run "python markdown.py --help" to see more options.
22
See <http://www.freewisdom.org/projects/python-markdown/> for more
23
information and instructions on how to extend the functionality of
24
Python Markdown. Read that before you try modifying this file.
26
## Authors and License
28
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
29
maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
30
Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
32
Contact: markdown@freewisdom.org
34
Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
35
Copyright 200? Django Software Foundation (OrderedDict implementation)
36
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
37
Copyright 2004 Manfred Stienstra (the original version)
39
License: BSD (see docs/LICENSE for details).
43
version_info = (2,0,0, "Final")
50
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
55
=============================================================================
59
Constants you might want to modify
60
-----------------------------------------------------------------------------
63
# Default logging level for command-line use.
COMMAND_LINE_LOGGING_LEVEL = CRITICAL

# Tabs in the source text are expanded to this many spaces.
TAB_LENGTH = 4

# @id = xyz -> <... id="xyz">
ENABLE_ATTRIBUTES = True

# this_or_that does not become this<i>or</i>that
SMART_EMPHASIS = True

# Output format used when none is given (an xhtml or html4 format key).
DEFAULT_OUTPUT_FORMAT = 'xhtml1'

# Text used instead of HTML in safe mode.
HTML_REMOVED_TEXT = "[HTML_REMOVED]"
70
BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
71
"|script|noscript|form|fieldset|iframe|math"
72
"|ins|del|hr|hr/|style|li|dt|dd|thead|tbody"
74
# Element used to wrap the document; it is stripped again from the
# serialized output when stripTopLevelTags is set.
DOC_TAG = "div"

# Placeholder delimiters. convert() removes any STX/ETX characters from the
# source text before processing.
STX = u'\u0002'  # "Start of text": start-of-placeholder marker
ETX = u'\u0003'  # "End of text": end-of-placeholder marker

# Inline placeholders look like STX + "klzzwxh:" + <id> + ETX.
INLINE_PLACEHOLDER_PREFIX = STX + "klzzwxh:"
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX

# Placeholder form of an ampersand.
AMP_SUBSTITUTE = STX + "amp" + ETX
85
Constants you probably do not need to change
86
-----------------------------------------------------------------------------
89
RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
90
# Hebrew (0590-05FF), Arabic (0600-06FF),
91
# Syriac (0700-074F), Arabic supplement (0750-077F),
92
# Thaana (0780-07BF), Nko (07C0-07FF).
93
(u'\u2D30', u'\u2D7F'), # Tifinagh
98
AUXILIARY GLOBAL FUNCTIONS
99
=============================================================================
103
def message(level, text):
104
""" A wrapper method for logging debug messages. """
105
logger = logging.getLogger('MARKDOWN')
107
# The logger is configured
108
logger.log(level, text)
112
raise MarkdownException, text
114
warnings.warn(text, MarkdownWarning)
117
def isBlockLevel(tag):
    """Check whether `tag` is a block level HTML tag.

    Returns the result of matching `tag` against BLOCK_LEVEL_ELEMENTS: a
    match object (truthy) when the pattern matches at the start of `tag`,
    otherwise None.
    """
    block_match = BLOCK_LEVEL_ELEMENTS.match(tag)
    return block_match
122
MISC AUXILIARY CLASSES
123
=============================================================================
126
class AtomicString(unicode):
    """Marker subclass of unicode for text that should not be processed further."""
    pass
131
class MarkdownException(Exception):
    """Exception type raised by Markdown for errors it reports."""
    pass
136
class MarkdownWarning(Warning):
    """Warning category used when Markdown issues a warning."""
    pass
143
=============================================================================
145
Markdown processing takes place in five steps:
147
1. A bunch of "preprocessors" munge the input text.
148
2. BlockParser() parses the high-level structural elements of the
149
pre-processed text into an ElementTree.
150
3. A bunch of "treeprocessors" are run against the ElementTree. One such
151
treeprocessor runs InlinePatterns against the ElementTree, detecting inline
153
4. Some post-processors are run against the text after the ElementTree has
154
been serialized into text.
155
5. The output is written to a string.
157
Those steps are put together by the Markdown() class.
162
import blockprocessors
163
import treeprocessors
164
import inlinepatterns
165
import postprocessors
170
# Extensions should use "markdown.etree" instead of "etree" (or do `from
171
# markdown import etree`). Do not import it by yourself.
173
etree = etree_loader.importETree()
175
# Adds the ability to output html4
180
"""Convert Markdown to HTML."""
184
extension_configs={},
186
output_format=DEFAULT_OUTPUT_FORMAT):
188
Creates a new Markdown instance.
192
* extensions: A list of extensions.
193
If they are of type string, the module mdx_name.py will be loaded.
194
If they are a subclass of markdown.Extension, they will be used
196
* extension_configs: Configuration settings for extensions.
197
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
198
* output_format: Format of output. Supported formats are:
199
* "xhtml1": Outputs XHTML 1.x. Default.
200
* "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
201
* "html4": Outputs HTML 4
202
* "html": Outputs latest supported version of HTML (currently HTML 4).
203
Note that it is suggested that the more specific formats ("xhtml1"
204
and "html4") be used as "xhtml" or "html" may change in the future
205
if it makes sense at that time.
209
self.safeMode = safe_mode
210
self.registeredExtensions = []
212
self.stripTopLevelTags = True
215
self.preprocessors = odict.OrderedDict()
216
self.preprocessors["html_block"] = \
217
preprocessors.HtmlBlockPreprocessor(self)
218
self.preprocessors["reference"] = \
219
preprocessors.ReferencePreprocessor(self)
220
# footnote preprocessor will be inserted with "<reference"
222
# Block processors - run by the parser
223
self.parser = blockparser.BlockParser()
224
self.parser.blockprocessors['empty'] = \
225
blockprocessors.EmptyBlockProcessor(self.parser)
226
self.parser.blockprocessors['indent'] = \
227
blockprocessors.ListIndentProcessor(self.parser)
228
self.parser.blockprocessors['code'] = \
229
blockprocessors.CodeBlockProcessor(self.parser)
230
self.parser.blockprocessors['hashheader'] = \
231
blockprocessors.HashHeaderProcessor(self.parser)
232
self.parser.blockprocessors['setextheader'] = \
233
blockprocessors.SetextHeaderProcessor(self.parser)
234
self.parser.blockprocessors['hr'] = \
235
blockprocessors.HRProcessor(self.parser)
236
self.parser.blockprocessors['olist'] = \
237
blockprocessors.OListProcessor(self.parser)
238
self.parser.blockprocessors['ulist'] = \
239
blockprocessors.UListProcessor(self.parser)
240
self.parser.blockprocessors['quote'] = \
241
blockprocessors.BlockQuoteProcessor(self.parser)
242
self.parser.blockprocessors['paragraph'] = \
243
blockprocessors.ParagraphProcessor(self.parser)
246
#self.prePatterns = []
248
# Inline patterns - Run on the tree
249
self.inlinePatterns = odict.OrderedDict()
250
self.inlinePatterns["backtick"] = \
251
inlinepatterns.BacktickPattern(inlinepatterns.BACKTICK_RE)
252
self.inlinePatterns["escape"] = \
253
inlinepatterns.SimpleTextPattern(inlinepatterns.ESCAPE_RE)
254
self.inlinePatterns["reference"] = \
255
inlinepatterns.ReferencePattern(inlinepatterns.REFERENCE_RE, self)
256
self.inlinePatterns["link"] = \
257
inlinepatterns.LinkPattern(inlinepatterns.LINK_RE, self)
258
self.inlinePatterns["image_link"] = \
259
inlinepatterns.ImagePattern(inlinepatterns.IMAGE_LINK_RE, self)
260
self.inlinePatterns["image_reference"] = \
261
inlinepatterns.ImageReferencePattern(inlinepatterns.IMAGE_REFERENCE_RE, self)
262
self.inlinePatterns["autolink"] = \
263
inlinepatterns.AutolinkPattern(inlinepatterns.AUTOLINK_RE, self)
264
self.inlinePatterns["automail"] = \
265
inlinepatterns.AutomailPattern(inlinepatterns.AUTOMAIL_RE, self)
266
self.inlinePatterns["linebreak2"] = \
267
inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_2_RE, 'br')
268
self.inlinePatterns["linebreak"] = \
269
inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_RE, 'br')
270
self.inlinePatterns["html"] = \
271
inlinepatterns.HtmlPattern(inlinepatterns.HTML_RE, self)
272
self.inlinePatterns["entity"] = \
273
inlinepatterns.HtmlPattern(inlinepatterns.ENTITY_RE, self)
274
self.inlinePatterns["not_strong"] = \
275
inlinepatterns.SimpleTextPattern(inlinepatterns.NOT_STRONG_RE)
276
self.inlinePatterns["strong_em"] = \
277
inlinepatterns.DoubleTagPattern(inlinepatterns.STRONG_EM_RE, 'strong,em')
278
self.inlinePatterns["strong"] = \
279
inlinepatterns.SimpleTagPattern(inlinepatterns.STRONG_RE, 'strong')
280
self.inlinePatterns["emphasis"] = \
281
inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_RE, 'em')
282
self.inlinePatterns["emphasis2"] = \
283
inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_2_RE, 'em')
284
# The order of the handlers matters!!!
287
# Tree processors - run once we have a basic parse.
288
self.treeprocessors = odict.OrderedDict()
289
self.treeprocessors["inline"] = treeprocessors.InlineProcessor(self)
290
self.treeprocessors["prettify"] = \
291
treeprocessors.PrettifyTreeprocessor(self)
293
# Postprocessors - finishing touches.
294
self.postprocessors = odict.OrderedDict()
295
self.postprocessors["raw_html"] = \
296
postprocessors.RawHtmlPostprocessor(self)
297
self.postprocessors["amp_substitute"] = \
298
postprocessors.AndSubstitutePostprocessor()
299
# footnote postprocessor will be inserted with ">amp_substitute"
301
# Map format keys to serializers
302
self.output_formats = {
303
'html' : html4.to_html_string,
304
'html4' : html4.to_html_string,
305
'xhtml' : etree.tostring,
306
'xhtml1': etree.tostring,
310
self.htmlStash = preprocessors.HtmlStash()
311
self.registerExtensions(extensions = extensions,
312
configs = extension_configs)
313
self.set_output_format(output_format)
316
def registerExtensions(self, extensions, configs):
318
Register extensions with this instance of Markdown.
322
* extensions: A list of extensions, which can either
323
be strings or objects. See the docstring on Markdown.
324
* configs: A dictionary mapping module names to config options.
327
for ext in extensions:
328
if isinstance(ext, basestring):
329
ext = load_extension(ext, configs.get(ext, []))
331
ext.extendMarkdown(self, globals())
332
except AttributeError:
333
message(ERROR, "Incorrect type! Extension '%s' is "
334
"neither a string or an Extension." %(repr(ext)))
337
def registerExtension(self, extension):
    """Add `extension` to this instance's list of registered extensions.

    This gets called by the extension itself.
    """
    registered = self.registeredExtensions
    registered.append(extension)
343
Resets all state variables so that we can start with a new text.
345
self.htmlStash.reset()
346
self.references.clear()
348
for extension in self.registeredExtensions:
351
def set_output_format(self, format):
352
""" Set the output format for the class instance. """
354
self.serializer = self.output_formats[format.lower()]
356
message(CRITICAL, 'Invalid Output Format: "%s". Use one of %s.' \
357
% (format, self.output_formats.keys()))
359
def convert(self, source):
361
Convert markdown to serialized XHTML or HTML.
365
* source: Source text as a Unicode string.
369
# Fixup the source text
370
if not source.strip():
371
return u"" # a blank unicode string
373
source = unicode(source)
374
except UnicodeDecodeError:
375
message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
378
source = source.replace(STX, "").replace(ETX, "")
379
source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
380
source = re.sub(r'\n\s+\n', '\n\n', source)
381
source = source.expandtabs(TAB_LENGTH)
383
# Split into lines and run the line preprocessors.
384
self.lines = source.split("\n")
385
for prep in self.preprocessors.values():
386
self.lines = prep.run(self.lines)
388
# Parse the high-level elements.
389
root = self.parser.parseDocument(self.lines).getroot()
391
# Run the tree-processors
392
for treeprocessor in self.treeprocessors.values():
393
newRoot = treeprocessor.run(root)
397
# Serialize _properly_. Strip top-level tags.
398
output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf8"))
399
if self.stripTopLevelTags:
400
start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
401
end = output.rindex('</%s>'%DOC_TAG)
402
output = output[start:end].strip()
404
# Run the text post-processors
405
for pp in self.postprocessors.values():
406
output = pp.run(output)
408
return output.strip()
410
def convertFile(self, input=None, output=None, encoding=None):
411
"""Converts a markdown file and returns the HTML as a unicode string.
413
Decodes the file using the provided encoding (defaults to utf-8),
414
passes the file content to markdown, and outputs the html to either
415
the provided stream or the file with provided name, using the same
416
encoding as the source file.
418
**Note:** This is the only place that decoding and encoding of unicode
419
takes place in Python-Markdown. (All other code is unicode-in /
424
* input: Name of source text file.
425
* output: Name of output file. Writes to stdout if `None`.
426
* encoding: Encoding of input and output files. Defaults to utf-8.
430
encoding = encoding or "utf-8"
433
input_file = codecs.open(input, mode="r", encoding=encoding)
434
text = input_file.read()
436
text = text.lstrip(u'\ufeff') # remove the byte-order mark
439
html = self.convert(text)
441
# Write to file or stdout
442
if isinstance(output, (str, unicode)):
443
output_file = codecs.open(output, "w", encoding=encoding)
444
output_file.write(html)
447
output.write(html.encode(encoding))
452
-----------------------------------------------------------------------------
456
""" Base class for extensions to subclass. """
457
def __init__(self, configs=None):
    """Create an instance of an Extension.

    * configs: A dict of configuration settings used by an Extension.
      When omitted, a new empty dict is created for this instance.

    """
    # A literal `{}` default is created once at definition time and would be
    # shared by every instance constructed without arguments, so a change
    # made through one instance would leak into the others.
    if configs is None:
        configs = {}
    self.config = configs
466
def getConfig(self, key):
467
""" Return a setting for the given key or an empty string. """
468
if key in self.config:
469
return self.config[key][0]
473
def getConfigInfo(self):
    """Return a list of (key, info) tuples, one per config setting.

    `info` is the element at index 1 of the entry stored under `key`.
    """
    info = []
    for key, entry in self.config.items():
        info.append((key, entry[1]))
    return info
477
def setConfig(self, key, value):
    """Store `value` as the first element of the config entry for `key`.

    The entry for `key` must already exist; a missing key raises KeyError.
    """
    entry = self.config[key]
    entry[0] = value
481
def extendMarkdown(self, md, md_globals):
483
Add the various processors and patterns to the Markdown instance.
485
This method must be overridden by every extension.
489
* md: The Markdown instance.
491
* md_globals: Global variables in the markdown module namespace.
497
def load_extension(ext_name, configs = []):
498
"""Load an extension module by name and return the result of its makeExtension().
500
The extension name may contain arguments as part of the string in the
501
following format: "extname(key1=value1,key2=value2)"
505
# Parse extensions config params (ignore the order)
506
configs = dict(configs)
507
pos = ext_name.find("(") # find the first "("
509
ext_args = ext_name[pos+1:-1]
510
ext_name = ext_name[:pos]
511
pairs = [x.split("=") for x in ext_args.split(",")]
512
configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
514
# Setup the module names
515
ext_module = 'markdown.extensions'
516
module_name_new_style = '.'.join([ext_module, ext_name])
517
module_name_old_style = '_'.join(['mdx', ext_name])
519
# Try loading the extension first from one place, then another
520
try: # New style (markdown.extensions.<extension>)
521
module = __import__(module_name_new_style, {}, {}, [ext_module])
523
try: # Old style (mdx_<extension>)
524
module = __import__(module_name_old_style)
526
message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
527
% (ext_name, module_name_new_style, module_name_old_style))
528
# Return None so we don't try to instantiate a non-existent extension
531
# If the module is loaded successfully, we expect it to define a
532
# function called makeExtension()
534
return module.makeExtension(configs.items())
535
except AttributeError:
536
message(CRITICAL, "Failed to initiate extension '%s'" % ext_name)
539
def load_extensions(ext_names):
540
"""Loads multiple extensions"""
542
for ext_name in ext_names:
543
extension = load_extension(ext_name)
545
extensions.append(extension)
551
=============================================================================
553
Those are the two functions we really mean to export: markdown() and
560
output_format = DEFAULT_OUTPUT_FORMAT):
561
"""Convert a markdown string to HTML and return HTML as a unicode string.
563
This is a shortcut function for `Markdown` class to cover the most
564
basic use case. It initializes an instance of Markdown, loads the
565
necessary extensions and runs the parser on the given text.
569
* text: Markdown formatted text as Unicode or ASCII string.
570
* extensions: A list of extensions or extension names (may contain config args).
571
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
572
* output_format: Format of output. Supported formats are:
573
* "xhtml1": Outputs XHTML 1.x. Default.
574
* "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
575
* "html4": Outputs HTML 4
576
* "html": Outputs latest supported version of HTML (currently HTML 4).
577
Note that it is suggested that the more specific formats ("xhtml1"
578
and "html4") be used as "xhtml" or "html" may change in the future
579
if it makes sense at that time.
581
Returns: An HTML document as a string.
584
md = Markdown(extensions=load_extensions(extensions),
586
output_format=output_format)
587
return md.convert(text)
590
def markdownFromFile(input = None,
595
output_format = DEFAULT_OUTPUT_FORMAT):
596
"""Read markdown code from a file and write it to a file or a stream."""
597
md = Markdown(extensions=load_extensions(extensions),
599
output_format=output_format)
600
md.convertFile(input, output, encoding)