From 2a82a57b2d39757d360483ba22d089ad7d149deb Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Sat, 10 Aug 2013 00:21:53 -0400
Subject: [PATCH] Add support for bold and italic text (part one).

---
 mwparserfromhell/parser/contexts.py  | 23 ++++++++----
 mwparserfromhell/parser/tokenizer.py | 69 +++++++++++++++++++++++++++++++++---
 2 files changed, 81 insertions(+), 11 deletions(-)

diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index 2785708..2ae3cc3 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -69,6 +69,11 @@ Local (stack-specific) contexts:
     * :py:const:`TAG_BODY`
     * :py:const:`TAG_CLOSE`
 
+* :py:const:`STYLE`
+
+    * :py:const:`STYLE_ITALICS`
+    * :py:const:`STYLE_BOLD`
+
 * :py:const:`DL_TERM`
 
 * :py:const:`SAFETY_CHECK`
@@ -117,14 +122,18 @@ TAG_BODY =  1 << 16
 TAG_CLOSE = 1 << 17
 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE
 
-DL_TERM = 1 << 18
+STYLE_ITALICS = 1 << 18
+STYLE_BOLD =    1 << 19
+STYLE = STYLE_ITALICS + STYLE_BOLD
+
+DL_TERM = 1 << 20
 
-HAS_TEXT =       1 << 19
-FAIL_ON_TEXT =   1 << 20
-FAIL_NEXT  =     1 << 21
-FAIL_ON_LBRACE = 1 << 22
-FAIL_ON_RBRACE = 1 << 23
-FAIL_ON_EQUALS = 1 << 24
+HAS_TEXT =       1 << 21
+FAIL_ON_TEXT =   1 << 22
+FAIL_NEXT  =     1 << 23
+FAIL_ON_LBRACE = 1 << 24
+FAIL_ON_RBRACE = 1 << 25
+FAIL_ON_EQUALS = 1 << 26
 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
                 FAIL_ON_RBRACE + FAIL_ON_EQUALS)
 
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index d3ce7bd..650e605 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -57,11 +57,11 @@ class Tokenizer(object):
     USES_C = False
     START = object()
     END = object()
-    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
-               "/", "-", "\n", END]
+    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
+               ":", "/", "-", "\n", END]
     MAX_DEPTH = 40
     MAX_CYCLES = 100000
-    regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
+    regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
     tag_splitter = re.compile(r"([\s\"\\]+)")
 
     def __init__(self):
@@ -629,6 +629,58 @@ class Tokenizer(object):
         else:
             self._emit_all(tag)
 
+    def _really_parse_style(self, context):
+        """Parse wiki-style bold or italics. Raises :py:exc:`BadRoute`."""
+        stack = self._parse(context)
+        markup = "''" if context == contexts.STYLE_ITALICS else "'''"
+        tag = "i" if context == contexts.STYLE_ITALICS else "b"
+
+        self._emit(tokens.TagOpenOpen(wiki_markup=markup))
+        self._emit_text(tag)
+        self._emit(tokens.TagCloseOpen())
+        self._emit_all(stack)
+        self._emit(tokens.TagOpenClose())
+        self._emit_text(tag)
+        self._emit(tokens.TagCloseClose())
+
+    def _parse_style(self):
+        """Parse wiki-style formatting (``''``/``'''`` for italics/bold)."""
+        self._head += 2
+        ticks = 2
+        while self._read() == "'":
+            self._head += 1
+            ticks += 1
+        reset = self._head
+
+        if ticks > 5:
+            self._emit_text("'" * (ticks - 5))
+            ticks = 5
+        elif ticks == 4:
+            self._emit_text("'")
+            ticks = 3
+
+        if ticks == 5:
+            raise NotImplementedError()
+        if ticks == 3:
+            try:
+                return self._really_parse_style(contexts.STYLE_BOLD)
+            except BadRoute:
+                self._emit_text("'")
+                self._head = reset
+        try:
+            self._really_parse_style(contexts.STYLE_ITALICS)
+        except BadRoute:
+            self._emit_text("''")
+            self._head = reset - 1
+
+    def _handle_style_end(self):
+        """Handle the end of wiki-style italics or bold (``''`` or ``'''``)."""
+        self._head += 1 if self._context & contexts.STYLE_ITALICS else 2
+        while self._read(1) == "'":
+            self._emit_text("'")
+            self._head += 1
+        return self._pop()
+
     def _handle_list_marker(self):
         """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``)."""
         markup = self._read()
@@ -667,7 +719,8 @@ class Tokenizer(object):
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
-                contexts.HEADING | contexts.COMMENT | contexts.TAG)
+                contexts.HEADING | contexts.COMMENT | contexts.TAG |
+                contexts.STYLE)
         double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
         if self._context & fail:
             if self._context & contexts.TAG_BODY:
@@ -817,6 +870,14 @@ class Tokenizer(object):
                     self._emit_text("<")
             elif this == ">" and self._context & contexts.TAG_CLOSE:
                 return self._handle_tag_close_close()
+            elif this == next == "'":
+                if not self._context & contexts.STYLE and self._can_recurse():
+                    self._parse_style()
+                elif (self._context & contexts.STYLE_ITALICS or
+                      self._read(2) == "'" and self._context & contexts.STYLE_BOLD):
+                    return self._handle_style_end()
+                else:
+                    self._emit_text("'")
             elif self._read(-1) in ("\n", self.START):
                 if this in ("#", "*", ";", ":"):
                     self._handle_list()