~widelands-dev/widelands-website/trunk

« back to all changes in this revision

Viewing changes to pybb/util.py

updated BeautifulSoup to a version compatible with Python 3

Show diffs side-by-side

added added

removed removed

Lines of Context:
2
2
import random
3
3
import traceback
4
4
import json
 
5
import re
5
6
 
6
 
from BeautifulSoup import BeautifulSoup
 
7
from bs4 import BeautifulSoup, NavigableString
7
8
from datetime import datetime
8
9
from django.shortcuts import render
9
10
from django.http import HttpResponse
11
12
from django.utils.translation import check_for_language
12
13
from django.utils.encoding import force_unicode
13
14
from django import forms
14
 
from django.template.defaultfilters import urlize as django_urlize
15
15
from django.core.paginator import Paginator, EmptyPage, InvalidPage
16
16
from django.conf import settings
17
17
from pybb import settings as pybb_settings
145
145
    return form
146
146
 
147
147
 
 
148
PLAIN_LINK_RE = re.compile(r'(http[s]?:\/\/[-a-zA-Z0-9@:%._\+~#=/?]+)')


def exclude_code_tag(bs4_string):
    """Filter callable for BeautifulSoup.find_all(string=...).

    Accept a text node only when it is not the direct child of a
    <code> tag and it contains a plain http(s) link matching
    PLAIN_LINK_RE.
    """
    if bs4_string.parent.name == 'code':
        return False
    return PLAIN_LINK_RE.search(bs4_string) is not None
 
156
 
 
157
 
148
158
def urlize(data):
    """Urlize plain text links in the HTML contents.

    Parse *data* as HTML, find every text node that contains a bare
    http(s) URL and is not inside a <code> tag (see exclude_code_tag),
    and wrap each URL found by PLAIN_LINK_RE in an <a> tag.  All other
    content is kept unchanged.

    Returns the resulting HTML as a string.
    """
    soup = BeautifulSoup(data, 'lxml')
    for found_string in soup.find_all(string=exclude_code_tag):
        new_content = []
        for string_or_tag in found_string.parent.contents:
            try:
                # re.split with a capturing group keeps the matched
                # URLs in the result, so we can rebuild the content
                # piece by piece.
                for string in PLAIN_LINK_RE.split(string_or_tag):
                    if string.startswith('http'):
                        # Wrap the bare URL in an anchor tag.
                        tag = soup.new_tag('a')
                        tag['href'] = string
                        tag.string = string
                        # NOTE(review): this writes a 'nofollow'
                        # attribute; the HTML standard expects
                        # rel="nofollow" — confirm the intent before
                        # changing the output markup.
                        tag['nofollow'] = 'true'
                        new_content.append(tag)
                    else:
                        # Plain text between URLs: re-wrap as a bs4
                        # string so it can live in .contents.
                        new_content.append(NavigableString(string))
            except TypeError:
                # string_or_tag is a Tag, not a string, so re.split()
                # cannot process it — keep it unchanged.  (Was a bare
                # `except:`, which also hid real errors.)
                new_content.append(string_or_tag)
        # Apply the new content in place of the old mixed content.
        found_string.parent.contents = new_content
    # str(), not unicode(): unicode() does not exist on Python 3 and
    # this code targets the Python-3-compatible bs4.
    return str(soup)
169
190