Browse Source

Support HTMLEntities.

tags/v0.1
Ben Kurtovic 11 years ago
parent
commit
010bd34653
2 changed files with 54 additions and 5 deletions
  1. +3
    -0
      mwparserfromhell/parser/builder.py
  2. +51
    -5
      mwparserfromhell/parser/tokenizer.py

+ 3
- 0
mwparserfromhell/parser/builder.py View File

@@ -98,9 +98,12 @@ class Builder(object):
token = self._tokens.pop(0)
if isinstance(token, tokens.HTMLEntityHex):
text = self._tokens.pop(0)
self._tokens.pop(0) # Remove HTMLEntityEnd
return HTMLEntity(text.text, named=False, hexadecimal=True,
hex_char=token.char)
self._tokens.pop(0) # Remove HTMLEntityEnd
return HTMLEntity(token.text, named=False, hexadecimal=False)
self._tokens.pop(0) # Remove HTMLEntityEnd
return HTMLEntity(token.text, named=True, hexadecimal=False)

def _handle_heading(self, token):


+ 51
- 5
mwparserfromhell/parser/tokenizer.py View File

@@ -20,6 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs
import string

from . import contexts
from . import tokens

@@ -49,9 +52,8 @@ class Tokenizer(object):
def _context(self, value):
self._stacks[-1][1] = value

def _push(self):
stack, context = [], 0
self._stacks.append([stack, context])
def _push(self, context=0):
self._stacks.append([[], context])

def _pop(self):
return self._stacks.pop()[0]
@@ -137,9 +139,51 @@ class Tokenizer(object):
self._context |= contexts.TEMPLATE_PARAM_VALUE
self._write(tokens.TemplateParamEquals())

def _parse_entity(self):
reset = self._head
self._head += 1
try:
self._push()
self._write(tokens.HTMLEntityStart())
numeric = hexadecimal = False
if self._at_head("#"):
numeric = True
self._write(tokens.HTMLEntityNumeric())
if self._read(1).lower() == "x":
hexadecimal = True
self._write(tokens.HTMLEntityHex(char=self._read(1)))
self._head += 2
else:
self._head += 1
text = []
valid = string.hexdigits if hexadecimal else string.digits
if not numeric and not hexadecimal:
valid += string.ascii_letters
while True:
if self._at_head(";"):
text = "".join(text)
if numeric:
test = int(text, 16) if hexadecimal else int(text)
if test < 1 or test > 0x10FFFF:
raise BadRoute(self._pop())
else:
if text not in htmlentitydefs.entitydefs:
raise BadRoute(self._pop())
self._write(tokens.Text(text=text))
self._write(tokens.HTMLEntityEnd())
break
if self._read() is self.END or self._read() not in valid:
raise BadRoute(self._pop())
text.append(self._read())
self._head += 1
except BadRoute:
self._head = reset
self._write(tokens.Text(text=self._read()))
else:
self._write_all(self._pop())

def _parse_until(self, stop, context=0):
self._push()
self._context = context
self._push(context)
while True:
self._verify_context_pre_stop()
if self._catch_stop(stop):
@@ -151,6 +195,8 @@ class Tokenizer(object):
self._handle_template_param()
elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY:
self._handle_template_param_value()
elif self._at_head("&"):
self._parse_entity()
else:
self._write(tokens.Text(text=self._read()))
self._head += 1


Loading…
Cancel
Save