From 2a82a57b2d39757d360483ba22d089ad7d149deb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 10 Aug 2013 00:21:53 -0400 Subject: [PATCH] Add support for bold and italic text (part one). --- mwparserfromhell/parser/contexts.py | 23 ++++++++---- mwparserfromhell/parser/tokenizer.py | 69 +++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 2785708..2ae3cc3 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -69,6 +69,11 @@ Local (stack-specific) contexts: * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` +* :py:const:`STYLE` + + * :py:const:`STYLE_ITALICS` + * :py:const:`STYLE_BOLD` + * :py:const:`DL_TERM` * :py:const:`SAFETY_CHECK` @@ -117,14 +122,18 @@ TAG_BODY = 1 << 16 TAG_CLOSE = 1 << 17 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -DL_TERM = 1 << 18 +STYLE_ITALICS = 1 << 18 +STYLE_BOLD = 1 << 19 +STYLE = STYLE_ITALICS + STYLE_BOLD + +DL_TERM = 1 << 20 -HAS_TEXT = 1 << 19 -FAIL_ON_TEXT = 1 << 20 -FAIL_NEXT = 1 << 21 -FAIL_ON_LBRACE = 1 << 22 -FAIL_ON_RBRACE = 1 << 23 -FAIL_ON_EQUALS = 1 << 24 +HAS_TEXT = 1 << 21 +FAIL_ON_TEXT = 1 << 22 +FAIL_NEXT = 1 << 23 +FAIL_ON_LBRACE = 1 << 24 +FAIL_ON_RBRACE = 1 << 25 +FAIL_ON_EQUALS = 1 << 26 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d3ce7bd..650e605 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -57,11 +57,11 @@ class Tokenizer(object): USES_C = False START = object() END = object() - MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", - "/", "-", "\n", END] + MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", + ":", "/", "-", "\n", END] MAX_DEPTH = 40 MAX_CYCLES = 100000 - regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE) + regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) tag_splitter = re.compile(r"([\s\"\\]+)") def __init__(self): @@ -629,6 +629,58 @@ class Tokenizer(object): else: self._emit_all(tag) + def _really_parse_style(self, context): + """Parse wiki-style bold or italics. Raises :py:exc:`BadRoute`.""" + stack = self._parse(context) + markup = "''" if context == contexts.STYLE_ITALICS else "'''" + tag = "i" if context == contexts.STYLE_ITALICS else "b" + + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(tag) + self._emit(tokens.TagCloseOpen()) + self._emit_all(stack) + self._emit(tokens.TagOpenClose()) + self._emit_text(tag) + self._emit(tokens.TagCloseClose()) + + def _parse_style(self): + """Parse wiki-style formatting (``''``/``'''`` for italics/bold).""" + self._head += 2 + ticks = 2 + while self._read() == "'": + self._head += 1 + ticks += 1 + reset = self._head + + if ticks > 5: + self._emit_text("'" * (ticks - 5)) + ticks = 5 + elif ticks == 4: + self._emit_text("'") + ticks = 3 + + if ticks == 5: + raise NotImplementedError() + if ticks == 3: + try: + return self._really_parse_style(contexts.STYLE_BOLD) + except BadRoute: + self._emit_text("'") + self._head = reset + try: + self._really_parse_style(contexts.STYLE_ITALICS) + except BadRoute: + self._emit_text("''") + self._head = reset - 1 + + def _handle_style_end(self): + """Handle the end of wiki-style italics or bold (``''`` or ``'''``).""" + self._head += 1 if self._context & contexts.STYLE_ITALICS else 2 + while self._read(1) == "'": + self._emit_text("'") + self._head += 1 + return self._pop() + def _handle_list_marker(self): """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).""" markup = self._read() @@ -667,7 +719,8 @@ class Tokenizer(object): def _handle_end(self): """Handle the end of the stream of wikitext.""" fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | - contexts.HEADING | contexts.COMMENT | contexts.TAG) + contexts.HEADING | contexts.COMMENT | contexts.TAG | + contexts.STYLE) double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) if self._context & fail: if self._context & contexts.TAG_BODY: @@ -817,6 +870,14 @@ class Tokenizer(object): self._emit_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() + elif this == next == "'": + if not self._context & contexts.STYLE and self._can_recurse(): + self._parse_style() + elif (self._context & contexts.STYLE_ITALICS or + self._read(2) == "'" and self._context & contexts.STYLE_BOLD): + return self._handle_style_end() + else: + self._emit_text("'") elif self._read(-1) in ("\n", self.START): if this in ("#", "*", ";", ":"): self._handle_list()