Procházet zdrojové kódy

Started table parsing in PyTokenizer

Started parsing tables and added the beginning of table support.
This is a big commit (ugh) and it should probably be split up into
multiple smaller ones if possible, but that seems unworkable as of
right now because of all the dependencies. Also breaks tests of
CTokenizer (double ugh) because I haven't started table support there.

May want to pick line by line on this commit later but I need to save
my work for now.
tags/v0.4
David Winegar před 10 roky
rodič
revize
a8d2983161
4 změnil soubory, kde provedl 111 přidání a 7 odebrání
  1. +1
    -1
      mwparserfromhell/definitions.py
  2. +7
    -1
      mwparserfromhell/parser/contexts.py
  3. +71
    -5
      mwparserfromhell/parser/tokenizer.py
  4. +32
    -0
      tests/tokenizer/tables.mwtest

+ 1
- 1
mwparserfromhell/definitions.py Zobrazit soubor

@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [

# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]

MARKUP_TO_HTML = {
"#": "li",


+ 7
- 1
mwparserfromhell/parser/contexts.py Zobrazit soubor

@@ -155,13 +155,19 @@ FAIL_ON_EQUALS = 1 << 29
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)

TABLE_OPEN = 1 << 30
TABLE_CELL_LINE = 1 << 31
TABLE_HEADER_LINE = 1 << 32
TABLE_CELL_OPEN = 1 << 33
TABLE_CELL_STYLE_POSSIBLE = 1 << 34

# Global contexts:

GL_HEADING = 1 << 0

# Aggregate contexts:

FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE


+ 71
- 5
mwparserfromhell/parser/tokenizer.py Zobrazit soubor

@@ -1002,6 +1002,39 @@ class Tokenizer(object):
self._fail_route()
return self._pop()

def _handle_table_start(self):
    """Try to parse a table beginning at the current ``{|`` marker.

    The head is moved past the two-character opener and the table body is
    parsed in the ``TABLE_OPEN`` context. If that parse raises ``BadRoute``,
    the head is rewound to one position past where it started and the
    opener is emitted as plain text; otherwise the parsed body is emitted
    wrapped in a ``table`` style tag.
    """
    # TODO - fail all other contexts on start?
    fallback_head = self._head + 1
    self._head += 2
    try:
        table = self._parse(contexts.TABLE_OPEN)
    except BadRoute:
        # Not a table after all: rewind and keep the opener as literal text.
        self._head = fallback_head
        self._emit_text("{|")
        return
    self._emit_style_tag("table", "{|", table)

def _handle_table_end(self):
self._head += 2
return self._pop()

def _handle_table_row(self):
    """Handle a table row marker by emitting a self-closing ``tr`` tag.

    Advances the head by two (the length of the row marker), emits the
    open/self-close tokens around the tag name ``tr``, and clears the
    ``TABLE_CELL_OPEN`` context bit.
    """
    self._head += 2
    # The dispatcher invokes this handler on "|" followed by "-", so the
    # recorded wiki markup must be "|-" (it was previously "{-").
    self._emit(tokens.TagOpenOpen(wiki_markup="|-"))
    self._emit_text("tr")
    self._emit(tokens.TagCloseSelfclose())
    # Starting a new row means no cell is open any more.
    self._context &= ~contexts.TABLE_CELL_OPEN

def _handle_table_cell(self):
pass

def _handle_header_cell(self):
pass

def _handle_cell_style(self):
pass

def _verify_safe(self, this):
"""Make sure we are not trying to write an invalid character."""
context = self._context
@@ -1144,15 +1177,48 @@ class Tokenizer(object):
result = self._parse_style()
if result is not None:
return result
elif self._read(-1) in ("\n", self.START):
if this in ("#", "*", ";", ":"):
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
self._handle_list()
elif this == next == self._read(2) == self._read(3) == "-":
elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-":
self._handle_hr()
else:
self._emit_text(this)
elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term()

elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or
(self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")):
if self._can_recurse():
self._handle_table_start()
else:
self._emit_text("{|")
elif self._context & contexts.TABLE_OPEN:
if this == "|" and next == "}":
return self._handle_table_end()
elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE:
self._handle_table_cell()
elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE:
self._handle_header_cell()
elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE:
self._handle_header_cell()
elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE:
self._handle_cell_style()
# on newline, clear out cell line contexts
elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE):
self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE)
self._emit_text(this)
# newline or whitespace/newline
elif (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")):
if this == "|" and next == "-":
self._handle_table_row()
elif this == "|" and self._can_recurse():
self._handle_table_cell()
elif this == "!" and self._can_recurse():
self._handle_header_cell()
else:
self._emit_text(this)
else:
self._emit_text(this)

else:
self._emit_text(this)
self._head += 1


+ 32
- 0
tests/tokenizer/tables.mwtest Zobrazit soubor

@@ -0,0 +1,32 @@
name: empty_table
label: Parsing an empty table.
input: "{|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()]

---

name: inline_table
label: Correctly handle tables with close on the same line.
input: "{||}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()]

---

name: no_table_close_simple
label: Handle case when there is no table close.
input: "{| "
output: [Text(text="{| ")]

---

name: leading_whitespace_table
label: Handle leading whitespace for a table.
input: "foo \n \t {|\n|}"
output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()]

---

name: leading_characters_table
label: Don't parse as a table when leading characters are not newline or whitespace.
input: "foo \n foo \t {|\n|}"
output: [Text(text="foo \n foo \t {|\n|}")]

Načítá se…
Zrušit
Uložit