changeset 237:a40126ebcb98

Savane-specific markup: direct convertion from PHP code (first draft, untested)
author Sylvain Beucler <beuc@beuc.net>
date Sun, 08 Aug 2010 01:17:11 +0200
parents 0b272ed5ffd1
children e867d41c8697
files savane/utils/markup.py
diffstat 1 files changed, 626 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/savane/utils/markup.py
@@ -0,0 +1,626 @@
+# Savane3 custom markup system
+# Copyright (C)  2005, 2006  Tobias Toedter
+# Copyright (C)  2005, 2006  Mathieu Roy
+# Copyright (C)  2010  Sylvain Beucler
+# 
+# This file is part of Savane.
+#
+# Savane is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Savane is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+## Provides functions to allow users to format the text in a secure way:
+##    markup_basic() for very light formatting
+##    markup_rich() for formatting excepting headers
+##    markup_full() for full formatting, including headers
+
+from django.conf import settings
+from xml.sax.saxutils import quoteattr
+import re
+
+def markup_info(level):
+    """
+    Return an HTML snippet telling the user, in a uniformized way,
+    which level of markup is available in an input field.
+
+    Takes as argument the level, being full / rich / basic / none;
+    any other value raises ValueError.
+    To avoid making pages look strange, this is meant to be put only
+    on textareas, where it is supposed to be the most useful.
+    """
+    # NOTE(review): `_` (gettext) is not imported by this module as far
+    # as this file shows -- confirm it is made available elsewhere.
+    if level == 'basic':
+        string = _("Basic markup")
+        text = _("Only basic text tags are available in this input field.")
+    elif level == 'rich':
+        string = _("Rich markup")
+        text = _("Rich and basic text tags are available in this input field.")
+    elif level == 'full':
+        string = _("Full markup")
+        text = _("Every tags are available in this input field.")
+    elif level == 'none':
+        string = _("No markup")
+        text = _("No tags are available in this input field.")
+    else:
+        # fail with a clear message rather than with an unbound local
+        # further down
+        raise ValueError("unknown markup level: %r" % (level,))
+
+    if level != 'none':
+        text = text + " " + _("Check the markup reminder in related documentation for a description of these tags.")
+
+    # quoteattr() escapes the text and supplies the surrounding quotes;
+    # the src attribute needs its own closing quote after the file name
+    return '<span class="help" title=' + quoteattr(text) \
+        + '><img src="' + settings.STATIC_MEDIA_URL \
+        + 'images/savane/common/misc.default/edit.png"' \
+        + ' border="0" class="icon" alt="" />' \
+        + string + '</span>'
+
+def markup_basic(text):
+    """
+    Apply the light, inline-only markup to every line of the input text
+    and return the resulting HTML.
+
+    The following syntax is supported:
+    *word* -> <strong>word</strong>
+    _word_ -> <em>word</em>
+    [http://gna.org/] -> <a href="http://gna.org/">http://gna.org/</a>
+    [http://gna.org/ text] -> <a href="http://gna.org/">text</a>
+    (bug|task|...) #1234 -> Link to corresponding page
+    """
+    # each line is formatted on its own, the line structure is kept
+    return "\n".join([_markup_inline(row) for row in text.split("\n")])
+
+def markup_rich(text):
+    """
+    Converts special markup characters in the input text to real HTML
+
+    On top of the inline markup done by markup_basic(), the following
+    is supported:
+    * paragraphs
+    * lists (<ul> and <ol>)
+    * nested lists
+    * horizontal rulers
+
+    This is markup_full() with the headings switched off.
+    """
+    return markup_full(text, allow_headings=False)
+
+def markup_full(text, allow_headings=True):
+    """
+    Converts special markup characters in the input text to real HTML
+
+    This function does the same markup as markup_rich(), plus
+    it converts headings to <h3> ... <h6> unless 'allow_headings'
+    is false.
+
+    Lines between +verbatim+ and -verbatim- are copied unformatted into
+    a read-only <input> or <textarea> (<pre> in printer mode); text
+    between +nomarkup+ and -nomarkup- is output without markup.
+
+    Returns the HTML as a single string.
+    """
+    # unique string used to hide 'nomarkup' inside verbatim blocks
+    nomarkup_uid = 'no-1a4f67a7-4eae-4aa1-a2ef-eecd8af6a997-markup'
+    verbatim_open = re.compile(r'[+]verbatim[+]')
+    verbatim_close = re.compile(r'-verbatim-')
+    nomarkup_tag = re.compile(r'([+-]nomarkup[+-])')
+
+    lines = text.split("\n")
+    result = []
+    printer = False  # used to be global var in PHP
+
+    # we use a stack (last in, first out) to track the current
+    # context (paragraph, lists) so we can correctly close tags
+    context_stack = []
+
+    quoted_text = False
+    # nesting depth of the verbatim environment, 0 when outside.
+    # A counter rather than a flag, because one verbatim block may
+    # be put into another, for instance in the recipe that explains
+    # the verbatim tag.
+    verbatim = 0
+    # the verbatim content is buffered, as we want to know its exact
+    # number of lines
+    verbatim_buffer = ''
+    verbatim_buffer_linecount = 0
+
+    for index, line in enumerate(lines):
+        # the tags may appear anywhere in the line, hence search()
+        if verbatim_open.search(line) and not verbatim:
+            verbatim = 1
+            verbatim_buffer = ''
+            verbatim_buffer_linecount = 0
+
+            if not printer:
+                context_stack.insert(0, '</textarea>')
+            else:
+                context_stack.insert(0, '</pre>')
+
+            # Jump to the next line, assuming that we can ignore the
+            # rest of the line
+            continue
+
+        # Increment the verbatim count if we find a verbatim opening
+        # in a verbatim environment
+        if verbatim_open.search(line) and verbatim:
+            verbatim += 1
+
+        if verbatim_close.search(line) and verbatim == 1:
+            verbatim = 0
+            context_stack.pop(0)
+
+            if not printer:
+                # Limit the textarea to 20 lines
+                rows = min(verbatim_buffer_linecount, 20)
+
+                # Use a text input if it is not multiline
+                if rows < 2:
+                    result.append('<input type="text" class="verbatim" readonly="readonly" size="60" value="' \
+                                      + verbatim_buffer \
+                                      + '" />')
+                else:
+                    result.append('<textarea class="verbatim" readonly="readonly" rows="' \
+                                      + str(rows) + '" cols="80">' \
+                                      + verbatim_buffer + '</textarea>')
+            else:
+                result.append('<pre class="verbatim">' + verbatim_buffer + '</pre>')
+            verbatim_buffer = ''
+            verbatim_buffer_linecount = 0
+
+            # Jump to the next line, assuming that we can ignore the
+            # rest of the line
+            continue
+
+        # Decrement the verbatim count if we find a verbatim closing
+        # in a nested verbatim environment
+        if verbatim_close.search(line) and verbatim > 1:
+            verbatim -= 1
+
+        # if we're in the verbatim markup, don't apply the markup
+        if verbatim:
+            # disable the +nomarkup+ tags by inserting a unique string.
+            # this has to be done in the original string, because that
+            # is the one which will be split upon the +nomarkup+ tags,
+            # see below
+            escaped_line = line.replace('nomarkup', nomarkup_uid)
+            lines[index] = escaped_line
+            # NOTE(review): the lines are buffered without a separator,
+            # so the line breaks only survive if the input still carries
+            # its "\r" -- confirm that is intended
+            verbatim_buffer += escaped_line
+            verbatim_buffer_linecount += 1
+        else:
+            # Otherwise, normal run, do the markup
+            result.append(_full_markup(line, allow_headings, context_stack, quoted_text))
+            # a bool cannot be updated by the callee, so the quote state
+            # for the next line is worked out here: a line is a quote
+            # when it starts with '&gt;'
+            quoted_text = line.startswith('&gt;')
+
+    # make sure that all previously used contexts get their
+    # proper closing tag by merging in the last closing tags
+    markup_text = "\n".join(result + context_stack)
+
+    # its easiest to markup everything, without supporting the nomarkup
+    # tag. afterwards, we replace every nomarkup tag pair with the content
+    # between those tags in the original string
+    original = nomarkup_tag.split("\n".join(lines))
+    markup = nomarkup_tag.split(markup_text)
+    # save the HTML tags from the last element in the markup array, see below
+    last_tags = markup[-1]
+    nomarkup_level = 0
+
+    # both lists are indexed in parallel below; pad the marked up one
+    # so that an unexpected mismatch cannot raise an IndexError
+    if len(markup) < len(original):
+        markup.extend([''] * (len(original) - len(markup)))
+
+    for index, original_text in enumerate(original):
+        # keep track of nomarkup tags
+        if original_text == '+nomarkup+':
+            nomarkup_level += 1
+        if original_text == '-nomarkup-':
+            nomarkup_level -= 1
+
+        # if the current match is the nomarkup tag, we don't want it to
+        # show up in the markup text -> set it to an empty string
+        if nomarkup_tag.search(original_text):
+            markup[index] = ''
+            original_text = ''
+
+        # while we're in a nomarkup environment, the already marked up text
+        # needs to be replaced with the original content. Also, we need
+        # to add <br />  tags for newlines.
+        if nomarkup_level > 0:
+            markup[index] = original_text.replace('\n', '<br />\n')
+
+    # normally, nomarkup_level must be zero at this point. however, if
+    # the user submits wrong markup and forgets to close the -nomarkup-
+    # tag, we need to take care of that.
+    # To do this, we need to look for closing tags which have been deleted.
+    if nomarkup_level > 0:
+        restored_tags = ''
+        for tag in reversed(last_tags.split('\n')):
+            if re.match(r'^\s*<\/[a-z]+>$', tag):
+                restored_tags = "\n" + tag + restored_tags
+            else:
+                markup.append(restored_tags)
+                break
+
+    # lastly, revert the escaping of +nomarkup+ tags done above
+    # for verbatim environments
+    return ''.join(markup).replace(nomarkup_uid, 'nomarkup')
+
+def markup_textoutput(text):
+    """
+    Convert whatever content that can contain markup to a valid text output
+    It wont touch what seems to be valid in text already, or what cannot
+    be converted in a very satisfactory way.
+    This function should be minimal, just to avoid weird things, not to do
+    very fancy things.
+
+    Named hyperlinks '[url text]' become 'text <url>' and the
+    +verbatim+ / -verbatim- / +nomarkup+ / -nomarkup- tags are removed.
+    Returns the converted text.
+    """
+    protocols = "https?|ftp|sftp|file|afs|nfs"
+    savane_tags = "verbatim|nomarkup"
+
+    named_link = re.compile(
+        # find the opening brace '['
+        r'\['
+        # followed by one of the protocols and '://'
+        + r'((' + protocols + r'):\/\/'
+        # match any character except whitespace or the closing
+        # brace ']' for the actual link
+        + r'[^\s\]]+)'
+        # followed by at least one whitespace
+        + r'\s+'
+        # followed by any character (non-greedy) and the
+        # next closing brace ']'
+        + r'(.+?)\]')
+    # the alternation has to be grouped, otherwise the '+' / '-' signs
+    # only bind to the first and last tag name
+    opening_tag = re.compile(r'\+(?:' + savane_tags + r')\+')
+    closing_tag = re.compile(r'-(?:' + savane_tags + r')-')
+
+    result = []
+    # work line by line so that a link never spans a line break
+    for line in text.split("\n"):
+        # Handle named hyperlink.
+        line = named_link.sub(r'\3 <\1>', line)
+
+        # Remove savane-specific tags
+        line = opening_tag.sub('', line)
+        line = closing_tag.sub('', line)
+        result.append(line)
+
+    return "\n".join(result)
+
+def _full_markup(line, allow_headings, context_stack, quoted_text):
+    """
+    Internal function for recognizing and formatting special markup
+    characters in the input line to real HTML
+
+    This function is a helper for markup_full() and should
+    not be used otherwise.
+
+    'line' is one line of the input text and 'allow_headings' enables
+    the === heading === syntax.
+    'context_stack' is the list of pending closing tags, innermost
+    first; it is updated in place.
+    'quoted_text' tells whether the previous line was a quote (started
+    with '&gt;').  It is only read: a bool cannot be handed back through
+    an argument, so the caller has to work out the value for the next
+    line itself.
+
+    Returns the HTML for the line.
+    """
+
+    #############################################################
+    # context formatting
+    #
+    # the code below marks up recognized special characters,
+    # by starting a new context (e.g. headings and lists)
+    #############################################################
+
+    # generally, we want to start a new paragraph. this will be set
+    # to false, if a new paragraph is no longer appropriate, like
+    # for headings or lists
+    start_paragraph = True
+
+    # Match the headings, e.g. === heading ===
+    # The helpers cannot clear our boolean; they rewrite the line when
+    # (and only when) they recognize their markup, so a changed line
+    # means that no paragraph must be started.
+    if allow_headings:
+        marked = _markup_headings(line, context_stack, start_paragraph)
+        if marked != line:
+            start_paragraph = False
+        line = marked
+
+    # Match list items
+    marked = _markup_lists(line, context_stack, start_paragraph)
+    if marked != line:
+        start_paragraph = False
+    line = marked
+
+    # replace four '-' sign with a horizontal ruler
+    if re.match(r'^----\s*$', line):
+        line = "\n".join(context_stack) + '<hr />'
+        # empty the stack in place, so that the caller sees it
+        del context_stack[:]
+        start_paragraph = False
+
+    ############################################################
+    # inline formatting
+    #
+    # the code below marks up recognized special characters,
+    # without starting a new context (e.g. <strong> and <em>)
+    #############################################################
+
+    marked = _markup_inline(line)
+    # tolerate a None result for an empty line
+    if marked is not None:
+        line = marked
+
+    #############################################################
+    # paragraph formatting
+    #
+    # the code below is responsible for doing the Right Thing(tm)
+    # by either starting a new paragraph and closing any previous
+    # context or continuing an existing paragraph
+    #############################################################
+
+    # change the quoting mode when the line starts with '>'
+    if line[0:4] == '&gt;':
+        # if the previous line was not quoted, start a new quote paragraph
+        if not quoted_text:
+            line = "\n".join(context_stack) + "<p class=\"quote\">" + line
+            # replace the stack content
+            context_stack[:] = ['</p>']
+            start_paragraph = False
+    elif quoted_text and start_paragraph and line != '':
+        # the previous line was quoted, end the quote paragraph
+        line = "\n".join(context_stack) + "\n<p>" + line
+        # replace the stack content
+        context_stack[:] = ['</p>']
+
+    # don't start a new paragraph again, if we already did that
+    if context_stack and context_stack[0] == '</p>':
+        start_paragraph = False
+
+    # add proper closing tags when we encounter an empty line.
+    # note that there might be no closing tags, in this case
+    # the line will remain empty.
+    if re.match(r'^(|\s*)$', line):
+        line = "\n".join(context_stack) + line
+        # empty the stack
+        del context_stack[:]
+        start_paragraph = False
+
+    # Finally start a new paragraph if appropriate
+    if start_paragraph:
+        # make sure that all previously used contexts get their
+        # proper closing tag
+        line = "\n".join(context_stack) + "<p>" + line
+        # replace the stack content
+        context_stack[:] = ['</p>']
+
+    # append a linebreak while in paragraph mode
+    if context_stack and context_stack[0] == '</p>':
+        line += '<br />'
+
+    return line
+
+def _markup_headings(line, context_stack, start_paragraph):
+    """
+    Internal function for recognizing and formatting headings
+
+    This function is a helper for _full_markup() and should
+    not be used otherwise.
+
+    'context_stack' is the list of pending closing tags; when a heading
+    is recognized the tags are prepended to the heading and the list is
+    emptied in place.
+    'start_paragraph' was passed by reference in PHP; a Python bool
+    cannot be changed for the caller, so it is unused here.  The line is
+    returned unchanged unless a heading was recognized.
+
+    Returns the (possibly rewritten) line.
+    """
+    matches = re.search(
+        # find one to four '=' signs at the start of a line
+        r'^(={1,4})'
+        # followed by exactly one space
+        + r' '
+        # followed by any character
+        + r'(.+)'
+        # followed by exactly one space
+        + r' '
+        # followed by one to four '=' signs at the end of a line (whitespace allowed)
+        + r'(={1,4})\s*$', line)
+    if not matches:
+        return line
+
+    header_level = len(matches.group(1))
+    # both sides have to carry the same number of '=' signs
+    if header_level != len(matches.group(3)):
+        return line
+
+    # if the user types '= heading =' (one '=' sign), it will
+    # actually be rendered as a level 3 heading <h3>
+    tag = 'h' + str(header_level + 2)
+
+    # make sure that all previously used contexts get their
+    # proper closing tag
+    line = "\n".join(context_stack) \
+        + "<" + tag + ">" + matches.group(2) + "</" + tag + ">"
+    # empty the stack in place, so that the caller sees it
+    del context_stack[:]
+    return line
+
+def _markup_lists(line, context_stack, start_paragraph):
+    """
+    Internal function for recognizing and formatting lists
+
+    This function is a helper for _full_markup() and should
+    not be used otherwise.
+
+    A list item is a line made of '*' (unordered) and/or '0' (ordered)
+    characters, one per nesting level, followed by a space and the text.
+
+    'context_stack' is the list of pending closing tags, innermost
+    first; it is updated in place.
+    'start_paragraph' was passed by reference in PHP; a Python bool
+    cannot be changed for the caller, so it is unused here.  The line is
+    returned unchanged unless a list item was recognized.
+
+    Returns the (possibly rewritten) line.
+    """
+    matches = re.search(r'^\s?([*0]+) (.+)$', line)
+    if not matches:
+        return line
+
+    markers = matches.group(1)
+    list_closers = ('</ul>', '</ol>')
+
+    # determine the list level currently in use
+    current_list_level = 0
+    for context in context_stack:
+        if context in list_closers:
+            current_list_level += 1
+
+    # determine whether the user list levels match the list
+    # level we have in our context stack
+    #
+    # this will catch (potential) errors of the following form:
+    # * list start
+    # 0 maybe wrong list character
+    # * list end
+    markup_position = 0
+    for context in reversed(context_stack):
+        # we only care for the list types
+        if context not in list_closers:
+            continue
+
+        # empty string when the item is less deep than the open lists
+        markup_character = markers[markup_position:markup_position + 1]
+
+        if ((markup_character == '*' and context != '</ul>')
+            or (markup_character == '0' and context != '</ol>')):
+            # force a new and clean list start
+            current_list_level = 0
+            break
+        markup_position += 1
+
+    # if we are not in a list, close the previous context
+    line = ''
+    if current_list_level == 0:
+        line = "\n".join(context_stack)
+        # empty the stack in place, so that the caller sees it
+        del context_stack[:]
+
+    # determine the list level the user wanted
+    wanted_list_level = len(markers)
+
+    # here we start a new list and make sure that the markup
+    # is valid, even if the user did skip one or more list levels
+    for level in range(current_list_level, wanted_list_level):
+        # the regexp only lets '*' and '0' through
+        if markers[level] == '*':
+            tag = 'ul'
+        else:
+            tag = 'ol'
+        line += "<" + tag + ">\n<li>"
+        context_stack.insert(0, "</" + tag + ">")
+        context_stack.insert(0, "</li>")
+
+    # here we end a previous list and make sure that the markup
+    # is valid, even if the user did skip one or more list levels
+    for level in range(wanted_list_level, current_list_level):
+        # closes the innermost item, then its list
+        line += context_stack.pop(0) + "\n"
+        line += context_stack.pop(0) + "\n"
+
+    # prepare the next item of the same list level
+    if current_list_level >= wanted_list_level:
+        line += "</li>\n<li>"
+
+    # finally, append the list item
+    line += matches.group(2)
+    return line
+
+def _markup_inline(line):
+    """
+    Internal function for recognizing and formatting inline tags and links
+
+    This function is a helper for markup_basic() and _full_markup() and
+    should not be used otherwise.
+
+    Handles, in this order: bare 'www.' and URL links, e-mail addresses,
+    references to tracker items and comments ('bug #123',
+    'comment #4'), [url text] and [url] hyperlinks, *strong* and _em_.
+
+    Returns the line with the inline markup turned into HTML; an empty
+    line is returned as is.
+    """
+    if not line:
+        return line
+
+    # Regexp of protocols supported in hyperlinks (should be protocols that
+    # we can expect web browsers to support)
+    protocols = "https?|ftp|sftp|file|afs|nfs"
+
+    # Case-insensitive patterns go through re.compile(): the fourth
+    # positional argument of re.sub() is 'count', not 'flags'.
+
+    # Prepare usual links: prefix every "www." with "http://"
+    # unless there is a // before
+    line = re.compile(r'(^|\s|[^\/])(www\.)', re.I).sub(r'\1http://\2', line)
+
+    # replace the @ sign with an HTML entity, if it is used within
+    # an url (e.g. for pointers to mailing lists). This way, the
+    # @ sign doesn't get mangled in the e-mail markup code
+    # below. See bug #2689 on http://gna.org/ for reference.
+    line = re.compile(r'([a-z]+://[^<>\s]+)@', re.I).sub(r'\1&#64;', line)
+
+    # Prepare the markup for normal links, e.g. http://test.org, by
+    # surrounding them with braces []
+    # (& = begin of html entities, it means a end of string unless
+    # it is &amp; which itself is the entity for &)
+    line = re.compile(
+        r'(^|\s|[^\[])((' + protocols + r'):\/\/(&amp;|[^\s&]+[a-z0-9\/^])+)',
+        re.I).sub(r'\1[\2]', line)
+
+    # do a markup for mail links, e.g. info@support.org
+    # (do not use utils_emails, this does extensive database
+    # search on the string
+    # and replace addresses in several fashion. Here we just want to make
+    # a link). Make sure that 'cvs -d:pserver:anonymous@cvs.sv.gnu.org:/...'
+    # is NOT replaced.
+    line = re.compile(
+        r'(^|\s)([a-z0-9_+-.]+@([a-z0-9_+-]+\.)+[a-z]+)(\s|$)',
+        re.I).sub(r'\1<a href="mailto:\2">\2</a>\4', line)
+
+    # Links between items
+    # FIXME: it should be i18n, but in a clever way, meaning that everytime
+    # a form is submitted with such string, the string get converted in
+    # english so we always get the links found without having a regexp
+    # including every possible language.
+    #
+    # (a sequence of pairs, so that the patterns are applied in a fixed
+    # order)
+    trackers = (
+        ("bugs?", "bugs/?"),
+        ("support|sr", "support/?"),
+        ("tasks?", "task/?"),
+        ("patch", "patch/?"),
+        # In this case, we make the link pointing to support, it wont matter,
+        # the download page is in every tracker and does not check if the tracker
+        # is actually used
+        ("files?", "support/download.php?file_id="),
+        )
+    # TODO: 'TODO:sys_home' is a placeholder for the base URL of the
+    # site, which still has to be wired in
+    sys_home = 'TODO:sys_home'
+    for regexp, link in trackers:
+        # Allows only two white space between the string and the numeric id
+        # to avoid having too time consuming regexp. People just have to pay
+        # attention.
+        line = re.compile(
+            r'(^|\s|\W)(' + regexp + r')\s{0,2}#([0-9]+)',
+            re.I).sub(r'\1<em><a href="' + sys_home
+                      + link + r'\3">\2&nbsp;#\3</a></em>',
+                      line)
+
+    # add an internal link for comments
+    line = re.compile(r'(comments?)\s{0,2}#([0-9]+)', re.I).sub(
+        r'<em><a href="#comment\2">\1&nbsp;#\2</a></em>', line)
+
+    # Add support for named hyperlinks, e.g.
+    # [http://gna.org/ Text] -> <a href="http://gna.org/">Text</a>
+    line = re.sub(
+        # find the opening brace '['
+        r'\['
+        # followed by one of the protocols and '://'
+        + r'((' + protocols + r'):\/\/'
+        # match any character except whitespace or the closing
+        # brace ']' for the actual link
+        + r'[^\s\]]+)'
+        # followed by at least one whitespace
+        + r'\s+'
+        # followed by any character (non-greedy) and the
+        # next closing brace ']'
+        + r'(.+?)\]',
+        r'<a href="\1">\3</a>', line)
+
+    # Add support for unnamed hyperlinks, e.g.
+    # [http://gna.org/] -> <a href="http://gna.org/">http://gna.org/</a>
+    line = re.sub(
+        # find the opening brace '['
+        r'\['
+        # followed by one of the protocols and '://'
+        # (FIXME: which protocol does it makes sense to support, which one
+        # should we ignore?)
+        + r'((' + protocols + r'):\/\/'
+        # match any character except whitespace (non-greedy) for
+        # the actual link, followed by the closing brace ']'
+        + r'[^\s]+?)\]',
+        r'<a href="\1">\1</a>', line)
+
+    # *word* -> <strong>word</strong>
+    line = re.sub(
+        # find an asterisk
+        r'\*'
+        # then one character (except a space or asterisk)
+        + r'([^* ]'
+        # then (optionally) any character except asterisk
+        + r'[^*]*?)'
+        # then an asterisk
+        + r'\*',
+        r'<strong>\1</strong>', line)
+
+    # _word_ -> <em>word</em>
+    line = re.sub(
+        # allow for the pattern to start at the beginning of a line.
+        # if it doesn't start there, the character before the underscore
+        # must be either whitespace or the closing brace '>', to
+        # allow for nested html tags (e.g. <p>_markup_</p>).
+        # Additionally, the opening brace may appear.
+        # See bug #10571 on http://gna.org/ for reference.
+        r'(^|\s+|>|\()'
+        # match the underscore
+        + r'_'
+        # match any character (non-greedy)
+        + r'(.+?)'
+        # match the ending underscore and either end of line or
+        # a non-word character
+        + r'_(\W|$)',
+        r'\1<em>\2</em>\3',
+        line)
+
+    return line