Browse Source

Add hooks for some ext link stuff; add an INVALID_LINK aggregate context.

tags/v0.3
Ben Kurtovic 11 years ago
parent
commit
cbf67c7842
5 changed files with 96 additions and 64 deletions
  1. +11
    -9
      mwparserfromhell/parser/contexts.py
  2. +9
    -6
      mwparserfromhell/parser/tokenizer.c
  3. +55
    -45
      mwparserfromhell/parser/tokenizer.h
  4. +14
    -4
      mwparserfromhell/parser/tokenizer.py
  5. +7
    -0
      tests/tokenizer/integration.mwtest

+ 11
- 9
mwparserfromhell/parser/contexts.py View File

@@ -51,11 +51,11 @@ Local (stack-specific) contexts:
* :py:const:`WIKILINK_TITLE` * :py:const:`WIKILINK_TITLE`
* :py:const:`WIKILINK_TEXT` * :py:const:`WIKILINK_TEXT`


* :py:const:`EXTERNAL_LINK`
* :py:const:`EXT_LINK`


* :py:const:`EXTERNAL_LINK_URL`
* :py:const:`EXTERNAL_LINK_TITLE`
* :py:const:`EXTERNAL_LINK_BRACKETS`
* :py:const:`EXT_LINK_URL`
* :py:const:`EXT_LINK_TITLE`
* :py:const:`EXT_LINK_BRACKETS`


* :py:const:`HEADING` * :py:const:`HEADING`


@@ -100,6 +100,7 @@ Aggregate contexts:
* :py:const:`FAIL` * :py:const:`FAIL`
* :py:const:`UNSAFE` * :py:const:`UNSAFE`
* :py:const:`DOUBLE` * :py:const:`DOUBLE`
* :py:const:`INVALID_LINK`


""" """


@@ -118,10 +119,10 @@ WIKILINK_TITLE = 1 << 5
WIKILINK_TEXT = 1 << 6 WIKILINK_TEXT = 1 << 6
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT


EXTERNAL_LINK_URL = 1 << 7
EXTERNAL_LINK_TITLE = 1 << 8
EXTERNAL_LINK_BRACKETS = 1 << 9
EXTERNAL_LINK = EXTERNAL_LINK_URL + EXTERNAL_LINK_TITLE
EXT_LINK_URL = 1 << 7
EXT_LINK_TITLE = 1 << 8
EXT_LINK_BRACKETS = 1 << 9
EXT_LINK = EXT_LINK_URL + EXT_LINK_TITLE + EXT_LINK_BRACKETS


HEADING_LEVEL_1 = 1 << 10 HEADING_LEVEL_1 = 1 << 10
HEADING_LEVEL_2 = 1 << 11 HEADING_LEVEL_2 = 1 << 11
@@ -161,7 +162,8 @@ GL_HEADING = 1 << 0


# Aggregate contexts: # Aggregate contexts:


FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXTERNAL_LINK + HEADING + TAG + STYLE
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME +
TAG_CLOSE) TAG_CLOSE)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL

+ 9
- 6
mwparserfromhell/parser/tokenizer.c View File

@@ -2192,9 +2192,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (Tokenizer_emit_char(self, this)) if (Tokenizer_emit_char(self, this))
return NULL; return NULL;
} }
else if (this == next && next == *"[") {
if (!(this_context & LC_WIKILINK_TITLE) &&
Tokenizer_CAN_RECURSE(self)) {
else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) {
if (!(this_context & AGG_INVALID_LINK)) {
if (Tokenizer_parse_wikilink(self)) if (Tokenizer_parse_wikilink(self))
return NULL; return NULL;
} }
@@ -2243,9 +2242,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
return NULL; return NULL;
} }
} }
else if (this == *"<") {
if (!(this_context & LC_TAG_CLOSE) &&
Tokenizer_CAN_RECURSE(self)) {
else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) {
if (Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_parse_tag(self)) if (Tokenizer_parse_tag(self))
return NULL; return NULL;
} }
@@ -2389,6 +2387,11 @@ static int load_tokens(void)
WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator");
WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose");


ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen");
ExternalLinkSeparator = PyObject_GetAttrString(tokens,
"ExternalLinkSeparator");
ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose");

HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart");
HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric");
HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex");


+ 55
- 45
mwparserfromhell/parser/tokenizer.h View File

@@ -82,6 +82,10 @@ static PyObject* WikilinkOpen;
static PyObject* WikilinkSeparator; static PyObject* WikilinkSeparator;
static PyObject* WikilinkClose; static PyObject* WikilinkClose;


static PyObject* ExternalLinkOpen;
static PyObject* ExternalLinkSeparator;
static PyObject* ExternalLinkClose;

static PyObject* HTMLEntityStart; static PyObject* HTMLEntityStart;
static PyObject* HTMLEntityNumeric; static PyObject* HTMLEntityNumeric;
static PyObject* HTMLEntityHex; static PyObject* HTMLEntityHex;
@@ -104,48 +108,53 @@ static PyObject* TagCloseClose;


/* Local contexts: */ /* Local contexts: */


#define LC_TEMPLATE 0x0000007
#define LC_TEMPLATE_NAME 0x0000001
#define LC_TEMPLATE_PARAM_KEY 0x0000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000004

#define LC_ARGUMENT 0x0000018
#define LC_ARGUMENT_NAME 0x0000008
#define LC_ARGUMENT_DEFAULT 0x0000010

#define LC_WIKILINK 0x0000060
#define LC_WIKILINK_TITLE 0x0000020
#define LC_WIKILINK_TEXT 0x0000040

#define LC_HEADING 0x0001F80
#define LC_HEADING_LEVEL_1 0x0000080
#define LC_HEADING_LEVEL_2 0x0000100
#define LC_HEADING_LEVEL_3 0x0000200
#define LC_HEADING_LEVEL_4 0x0000400
#define LC_HEADING_LEVEL_5 0x0000800
#define LC_HEADING_LEVEL_6 0x0001000

#define LC_TAG 0x001E000
#define LC_TAG_OPEN 0x0002000
#define LC_TAG_ATTR 0x0004000
#define LC_TAG_BODY 0x0008000
#define LC_TAG_CLOSE 0x0010000

#define LC_STYLE 0x01E0000
#define LC_STYLE_ITALICS 0x0020000
#define LC_STYLE_BOLD 0x0040000
#define LC_STYLE_PASS_AGAIN 0x0080000
#define LC_STYLE_SECOND_PASS 0x0100000

#define LC_DLTERM 0x0200000

#define LC_SAFETY_CHECK 0xFC00000
#define LC_HAS_TEXT 0x0400000
#define LC_FAIL_ON_TEXT 0x0800000
#define LC_FAIL_NEXT 0x1000000
#define LC_FAIL_ON_LBRACE 0x2000000
#define LC_FAIL_ON_RBRACE 0x4000000
#define LC_FAIL_ON_EQUALS 0x8000000
#define LC_TEMPLATE 0x00000007
#define LC_TEMPLATE_NAME 0x00000001
#define LC_TEMPLATE_PARAM_KEY 0x00000002
#define LC_TEMPLATE_PARAM_VALUE 0x00000004

#define LC_ARGUMENT 0x00000018
#define LC_ARGUMENT_NAME 0x00000008
#define LC_ARGUMENT_DEFAULT 0x00000010

#define LC_WIKILINK 0x00000060
#define LC_WIKILINK_TITLE 0x00000020
#define LC_WIKILINK_TEXT 0x00000040

#define LC_EXT_LINK 0x00000380
#define LC_EXT_LINK_URL 0x00000080
#define LC_EXT_LINK_TITLE 0x00000100
#define LC_EXT_LINK_BRACKETS 0x00000200

#define LC_HEADING 0x0000FC00
#define LC_HEADING_LEVEL_1 0x00000400
#define LC_HEADING_LEVEL_2 0x00000800
#define LC_HEADING_LEVEL_3 0x00001000
#define LC_HEADING_LEVEL_4 0x00002000
#define LC_HEADING_LEVEL_5 0x00004000
#define LC_HEADING_LEVEL_6 0x00008000

#define LC_TAG 0x000F0000
#define LC_TAG_OPEN 0x00010000
#define LC_TAG_ATTR 0x00020000
#define LC_TAG_BODY 0x00040000
#define LC_TAG_CLOSE 0x00080000

#define LC_STYLE 0x00F00000
#define LC_STYLE_ITALICS 0x00100000
#define LC_STYLE_BOLD 0x00200000
#define LC_STYLE_PASS_AGAIN 0x00400000
#define LC_STYLE_SECOND_PASS 0x00800000

#define LC_DLTERM 0x01000000

#define LC_SAFETY_CHECK 0x7E000000
#define LC_HAS_TEXT 0x02000000
#define LC_FAIL_ON_TEXT 0x04000000
#define LC_FAIL_NEXT 0x08000000
#define LC_FAIL_ON_LBRACE 0x10000000
#define LC_FAIL_ON_RBRACE 0x20000000
#define LC_FAIL_ON_EQUALS 0x40000000


/* Global contexts: */ /* Global contexts: */


@@ -153,9 +162,10 @@ static PyObject* TagCloseClose;


/* Aggregate contexts: */ /* Aggregate contexts: */


#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URL)


/* Tag contexts: */ /* Tag contexts: */




+ 14
- 4
mwparserfromhell/parser/tokenizer.py View File

@@ -311,6 +311,11 @@ class Tokenizer(object):
self._head += 1 self._head += 1
return self._pop() return self._pop()


def _parse_external_link(self, brackets):
"""Parse an external link at the head of the wikicode string."""
self._emit_text(self._read())
# raise NotImplementedError()

def _parse_heading(self): def _parse_heading(self):
"""Parse a section heading at the head of the wikicode string.""" """Parse a section heading at the head of the wikicode string."""
self._global |= contexts.GL_HEADING self._global |= contexts.GL_HEADING
@@ -898,8 +903,8 @@ class Tokenizer(object):
return self._handle_argument_end() return self._handle_argument_end()
else: else:
self._emit_text("}") self._emit_text("}")
elif this == next == "[":
if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
elif this == next == "[" and self._can_recurse():
if not self._context & contexts.INVALID_LINK:
self._parse_wikilink() self._parse_wikilink()
else: else:
self._emit_text("[") self._emit_text("[")
@@ -907,6 +912,11 @@ class Tokenizer(object):
self._handle_wikilink_separator() self._handle_wikilink_separator()
elif this == next == "]" and self._context & contexts.WIKILINK: elif this == next == "]" and self._context & contexts.WIKILINK:
return self._handle_wikilink_end() return self._handle_wikilink_end()
elif this == "[" and not self._context & contexts.INVALID_LINK: ## or this == ":"
if self._can_recurse():
self._parse_external_link(brackets=this == "[")
else:
self._emit_text("[")
elif this == "=" and not self._global & contexts.GL_HEADING: elif this == "=" and not self._global & contexts.GL_HEADING:
if self._read(-1) in ("\n", self.START): if self._read(-1) in ("\n", self.START):
self._parse_heading() self._parse_heading()
@@ -928,8 +938,8 @@ class Tokenizer(object):
self._handle_tag_open_close() self._handle_tag_open_close()
else: else:
self._handle_invalid_tag_start() self._handle_invalid_tag_start()
elif this == "<":
if not self._context & contexts.TAG_CLOSE and self._can_recurse():
elif this == "<" and not self._context & contexts.TAG_CLOSE:
if self._can_recurse():
self._parse_tag() self._parse_tag()
else: else:
self._emit_text("<") self._emit_text("<")


+ 7
- 0
tests/tokenizer/integration.mwtest View File

@@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum


--- ---


name: link_in_template_name
label: a wikilink inside a template name, which breaks the template
input: "{{foo[[bar]]}}"
output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")]

---

name: rich_heading name: rich_heading
label: a heading with templates/wikilinks in it label: a heading with templates/wikilinks in it
input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} ==" input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} =="


Loading…
Cancel
Save