"""
Python Markdown
===============

Python Markdown converts Markdown to HTML and can be used as a library or
called from the command line.

## Basic usage as a module:

    import markdown
    html = markdown.markdown(your_text_string)

See <http://www.freewisdom.org/projects/python-markdown/> for more
information and instructions on how to extend the functionality of
Python Markdown.  Read that before you try modifying this file.

## Authors and License

Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).

Contact: markdown@freewisdom.org

Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
Copyright 200? Django Software Foundation (OrderedDict implementation)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE for details).
"""
32
33
# Release identification for this package.
# `version_info` is the tuple form (major, minor, micro, release level);
# `version` is the matching dotted string.
version_info = (2, 1, 0, "Dev")
version = "2.1.0"
35
36
import re
37
import codecs
38
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
39
from md_logging import message
40
import util
41
from preprocessors import build_preprocessors
42
from blockprocessors import build_block_parser
43
from treeprocessors import build_treeprocessors
44
from inlinepatterns import build_inlinepatterns
45
from postprocessors import build_postprocessors
46
from extensions import Extension
47
import html4
48
49
# For backwards compatibility in the 2.0.x series:
# the things defined in these modules started off in __init__.py, so
# third-party code might need to access them here.
52
from util import *
53
54
55
class Markdown:
56
    """Convert Markdown to HTML."""
57
58
    doc_tag = "div"     # Element used to wrap document - later removed
59
    
60
    option_defaults = {
61
        'html_replacement_text' : '[HTML_REMOVED]',
62
        'tab_length'            : 4,
63
        'enable_attributes'     : True,
64
        'smart_emphasis'        : True,
65
    }
66
    
67
    output_formats = {
68
        'html'  : html4.to_html_string,
69
        'html4' : html4.to_html_string,
70
        'xhtml' : util.etree.tostring,
71
        'xhtml1': util.etree.tostring,
72
    }
73
74
    def __init__(self, extensions=[], **kwargs):
75
        """
76
        Creates a new Markdown instance.
77
78
        Keyword arguments:
79
80
        * extensions: A list of extensions.
81
           If they are of type string, the module mdx_name.py will be loaded.
82
           If they are a subclass of markdown.Extension, they will be used
83
           as-is.
84
        * extension-configs: Configuration settingis for extensions.
85
        * output_format: Format of output. Supported formats are:
86
            * "xhtml1": Outputs XHTML 1.x. Default.
87
            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
88
            * "html4": Outputs HTML 4
89
            * "html": Outputs latest supported version of HTML (currently HTML 4).
90
            Note that it is suggested that the more specific formats ("xhtml1"
91
            and "html4") be used as "xhtml" or "html" may change in the future
92
            if it makes sense at that time.
93
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
94
        * html_replacement_text: Text used when safe_mode is set to "replace".
95
        * tab_length: Length of tabs in the source. Default: 4
96
        * enable_attributes: Enable the conversion of attributes. Default: True
97
        * smart_emphsasis: Treat `_connected_words_` intelegently Default: True
98
99
        """
100
101
        for option, default in self.option_defaults.items():
102
            setattr(self, option, kwargs.get(option, default)) 
103
104
        self.safeMode = kwargs.get('safe_mode', False)
105
        self.registeredExtensions = []
106
        self.docType = ""
107
        self.stripTopLevelTags = True
108
109
        self.build_parser()
110
111
        self.references = {}
112
        self.htmlStash = util.HtmlStash()
113
        self.registerExtensions(extensions = extensions,
114
                                configs = kwargs.get('extension_configs', {}))
115
        self.set_output_format(kwargs.get('output_format', 'xhtml1'))
116
        self.reset()
117
118
    def build_parser(self):
119
        """ Build the parser from the various parts. """
120
        self.preprocessors = build_preprocessors(self)
121
        self.parser = build_block_parser(self) 
122
        self.inlinePatterns = build_inlinepatterns(self)
123
        self.treeprocessors = build_treeprocessors(self)
124
        self.postprocessors = build_postprocessors(self)
125
126
    def registerExtensions(self, extensions, configs):
127
        """
128
        Register extensions with this instance of Markdown.
129
130
        Keyword aurguments:
131
132
        * extensions: A list of extensions, which can either
133
           be strings or objects.  See the docstring on Markdown.
134
        * configs: A dictionary mapping module names to config options.
135
136
        """
137
        for ext in extensions:
138
            if isinstance(ext, basestring):
139
                ext = self.build_extension(ext, configs.get(ext, []))
140
            if isinstance(ext, Extension):
141
                try:
142
                    ext.extendMarkdown(self, globals())
143
                except NotImplementedError, e:
144
                    message(ERROR, e)
145
            else:
146
                message(ERROR,
147
                'Extension "%s.%s" must be of type: "markdown.Extension".' \
148
                    % (ext.__class__.__module__, ext.__class__.__name__))
149
150
    def build_extension(self, ext_name, configs = []):
151
        """Build extension by name, then return the module.
152
153
        The extension name may contain arguments as part of the string in the
154
        following format: "extname(key1=value1,key2=value2)"
155
156
        """
157
158
        # Parse extensions config params (ignore the order)
159
        configs = dict(configs)
160
        pos = ext_name.find("(") # find the first "("
161
        if pos > 0:
162
            ext_args = ext_name[pos+1:-1]
163
            ext_name = ext_name[:pos]
164
            pairs = [x.split("=") for x in ext_args.split(",")]
165
            configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
166
167
        # Setup the module names
168
        ext_module = 'markdown.extensions'
169
        module_name_new_style = '.'.join([ext_module, ext_name])
170
        module_name_old_style = '_'.join(['mdx', ext_name])
171
172
        # Try loading the extention first from one place, then another
173
        try: # New style (markdown.extensons.<extension>)
174
            module = __import__(module_name_new_style, {}, {}, [ext_module])
175
        except ImportError:
176
            try: # Old style (mdx_<extension>)
177
                module = __import__(module_name_old_style)
178
            except ImportError:
179
               message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
180
                   % (ext_name, module_name_new_style, module_name_old_style))
181
               # Return None so we don't try to initiate none-existant extension
182
               return None
183
184
        # If the module is loaded successfully, we expect it to define a
185
        # function called makeExtension()
186
        try:
187
            return module.makeExtension(configs.items())
188
        except AttributeError, e:
189
            message(CRITICAL, "Failed to initiate extension '%s': %s" % (ext_name, e))
190
    
191
    def registerExtension(self, extension):
192
        """ This gets called by the extension """
193
        self.registeredExtensions.append(extension)
194
195
    def reset(self):
196
        """
197
        Resets all state variables so that we can start with a new text.
198
        """
199
        self.htmlStash.reset()
200
        self.references.clear()
201
202
        for extension in self.registeredExtensions:
203
            if hasattr(extension, 'reset'):
204
                extension.reset()
205
206
    def set_output_format(self, format):
207
        """ Set the output format for the class instance. """
208
        try:
209
            self.serializer = self.output_formats[format.lower()]
210
        except KeyError:
211
            message(CRITICAL,
212
                    'Invalid Output Format: "%s". Use one of %s.' \
213
                               % (format, self.output_formats.keys()))
214
215
    def convert(self, source):
216
        """
217
        Convert markdown to serialized XHTML or HTML.
218
219
        Keyword arguments:
220
221
        * source: Source text as a Unicode string.
222
223
        Markdown processing takes place in five steps:
224
225
        1. A bunch of "preprocessors" munge the input text.
226
        2. BlockParser() parses the high-level structural elements of the
227
           pre-processed text into an ElementTree.
228
        3. A bunch of "treeprocessors" are run against the ElementTree. One 
229
           such treeprocessor runs InlinePatterns against the ElementTree, 
230
           detecting inline markup.
231
        4. Some post-processors are run against the text after the ElementTree 
232
           has been serialized into text.
233
        5. The output is written to a string.
234
235
        """
236
237
        # Fixup the source text
238
        if not source.strip():
239
            return u""  # a blank unicode string
240
        try:
241
            source = unicode(source)
242
        except UnicodeDecodeError:
243
            message(CRITICAL,
244
                    'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
245
            return u""
246
247
        source = source.replace(util.STX, "").replace(util.ETX, "")
248
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
249
        source = re.sub(r'\n\s+\n', '\n\n', source)
250
        source = source.expandtabs(self.tab_length)
251
252
        # Split into lines and run the line preprocessors.
253
        self.lines = source.split("\n")
254
        for prep in self.preprocessors.values():
255
            self.lines = prep.run(self.lines)
256
257
        # Parse the high-level elements.
258
        root = self.parser.parseDocument(self.lines).getroot()
259
260
        # Run the tree-processors
261
        for treeprocessor in self.treeprocessors.values():
262
            newRoot = treeprocessor.run(root)
263
            if newRoot:
264
                root = newRoot
265
266
        # Serialize _properly_.  Strip top-level tags.
267
        output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8"))
268
        if self.stripTopLevelTags:
269
            try:
270
                start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
271
                end = output.rindex('</%s>'%self.doc_tag)
272
                output = output[start:end].strip()
273
            except ValueError:
274
                if output.strip().endswith('<%s />'%self.doc_tag):
275
                    # We have an empty document
276
                    output = ''
277
                else:
278
                    # We have a serious problem
279
                    message(CRITICAL, 'Failed to strip top level tags.')
280
281
        # Run the text post-processors
282
        for pp in self.postprocessors.values():
283
            output = pp.run(output)
284
285
        return output.strip()
286
287
    def convertFile(self, input=None, output=None, encoding=None):
288
        """Converts a markdown file and returns the HTML as a unicode string.
289
290
        Decodes the file using the provided encoding (defaults to utf-8),
291
        passes the file content to markdown, and outputs the html to either
292
        the provided stream or the file with provided name, using the same
293
        encoding as the source file.
294
295
        **Note:** This is the only place that decoding and encoding of unicode
296
        takes place in Python-Markdown.  (All other code is unicode-in /
297
        unicode-out.)
298
299
        Keyword arguments:
300
301
        * input: File object or path of file as string.
302
        * output: Name of output file. Writes to stdout if `None`.
303
        * encoding: Encoding of input and output files. Defaults to utf-8.
304
305
        """
306
307
        encoding = encoding or "utf-8"
308
309
        # Read the source
310
        if isinstance(input, basestring):
311
            input_file = codecs.open(input, mode="r", encoding=encoding)
312
        else:
313
            input_file = input
314
        text = input_file.read()
315
        input_file.close()
316
        text = text.lstrip(u'\ufeff') # remove the byte-order mark
317
318
        # Convert
319
        html = self.convert(text)
320
321
        # Write to file or stdout
322
        if isinstance(output, (str, unicode)):
323
            output_file = codecs.open(output, "w", encoding=encoding)
324
            output_file.write(html)
325
            output_file.close()
326
        else:
327
            output.write(html.encode(encoding))
328
329
330
"""
EXPORTED FUNCTIONS
=============================================================================

These are the two functions we really mean to export: markdown() and
markdownFromFile().
"""
337
338
def markdown(text, *args, **kwargs):
    """Convert a markdown string to HTML and return HTML as a unicode string.

    Convenience wrapper for the most basic use case: a `Markdown` instance
    is created from the given positional and keyword arguments and its
    `convert()` method is run on `text`.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * extensions: A list of extensions or extension names (may contain config args).
    * safe_mode: Disallow raw html.  One of "remove", "replace" or "escape".
    * output_format: Format of output. Supported formats are:
        * "xhtml1": Outputs XHTML 1.x. Default.
        * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
        * "html4": Outputs HTML 4
        * "html": Outputs latest supported version of HTML (currently HTML 4).
        Note that it is suggested that the more specific formats ("xhtml1"
        and "html4") be used as "xhtml" or "html" may change in the future
        if it makes sense at that time.

    Any other arguments are handed to the `Markdown` constructor unchanged.

    Returns: An HTML document as a string.

    """
    return Markdown(*args, **kwargs).convert(text)
364
365
366
def markdownFromFile(input = None,
                     output = None,
                     extensions = [],
                     encoding = None,
                     *args, **kwargs):
    """Read markdown code from a file and write it to a file or a stream.

    Keyword arguments:

    * input: Passed through to `Markdown.convertFile()` as the source.
    * output: Passed through to `Markdown.convertFile()` as the destination.
    * extensions: Extensions handed to the `Markdown` constructor.
    * encoding: Passed through to `Markdown.convertFile()`.

    Any remaining arguments are forwarded to the `Markdown` constructor.
    Nothing is returned.

    """
    converter = Markdown(extensions=extensions, *args, **kwargs)
    converter.convertFile(input, output, encoding)