Pārlūkot izejas kodu

Started table parsing in PyTokenizer

Started work on table parsing and added the beginnings of table support.
This is a big commit (ugh) and it should probably be split up into
multiple smaller ones if possible, but that seems unworkable as of
right now because of all the dependencies. Also breaks tests of
CTokenizer (double ugh) because I haven't started table support there.

May want to pick line by line on this commit later but I need to save
my work for now.
tags/v0.4
David Winegar pirms 10 gadiem
vecāks
revīzija
a8d2983161
4 mainītis faili ar 111 papildinājumiem un 7 dzēšanām
  1. +1
    -1
      mwparserfromhell/definitions.py
  2. +7
    -1
      mwparserfromhell/parser/contexts.py
  3. +71
    -5
      mwparserfromhell/parser/tokenizer.py
  4. +32
    -0
      tests/tokenizer/tables.mwtest

+ 1
- 1
mwparserfromhell/definitions.py Parādīt failu

@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [


# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]


MARKUP_TO_HTML = { MARKUP_TO_HTML = {
"#": "li", "#": "li",


+ 7
- 1
mwparserfromhell/parser/contexts.py Parādīt failu

@@ -155,13 +155,19 @@ FAIL_ON_EQUALS = 1 << 29
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS) FAIL_ON_RBRACE + FAIL_ON_EQUALS)


# Table contexts. TABLE_OPEN is the context the tokenizer parses a table body in.
# NOTE(review): bits 31 and above do not fit in a signed 32-bit integer --
# confirm the C tokenizer's context type can hold these values.
TABLE_OPEN = 1 << 30
# Line-scoped flags: the tokenizer clears both when it reaches a newline.
TABLE_CELL_LINE = 1 << 31
TABLE_HEADER_LINE = 1 << 32
# Cleared by the tokenizer whenever a new table row starts.
TABLE_CELL_OPEN = 1 << 33
# While set, a lone "|" is passed to the cell-style handler; cleared on newline.
TABLE_CELL_STYLE_POSSIBLE = 1 << 34

# Global contexts: # Global contexts:


GL_HEADING = 1 << 0 GL_HEADING = 1 << 0


# Aggregate contexts: # Aggregate contexts:


FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE


+ 71
- 5
mwparserfromhell/parser/tokenizer.py Parādīt failu

@@ -1002,6 +1002,39 @@ class Tokenizer(object):
self._fail_route() self._fail_route()
return self._pop() return self._pop()


def _handle_table_start(self):
    """Try to parse a table beginning at the ``{|`` under the head.

    The table body is parsed in the ``TABLE_OPEN`` context. If that route
    fails, the head is rewound and the opening markup is emitted as plain
    text instead of a ``table`` tag.
    """
    # TODO - fail all other contexts on start?
    # Position of the "|" in "{|"; the head is restored here on failure.
    fallback = self._head + 1
    self._head = fallback + 1
    try:
        contents = self._parse(contexts.TABLE_OPEN)
    except BadRoute:
        self._head = fallback
        self._emit_text("{|")
        return
    # NOTE(review): the table-end handler leaves the head just past "|}";
    # if the caller then advances the head again, one character after the
    # table may be skipped -- confirm.
    self._emit_style_tag("table", "{|", contents)

def _handle_table_end(self):
self._head += 2
return self._pop()

def _handle_table_row(self):
    """Emit a self-closing ``tr`` tag for a ``|-`` table row separator.

    Advances the head by two and clears ``TABLE_CELL_OPEN`` from the
    current context, since a new row ends any cell that was open.
    """
    self._head += 2
    # The row separator is "|-" (that is what the tokenizer matches before
    # calling this handler); "{-" is not table markup.
    self._emit(tokens.TagOpenOpen(wiki_markup="|-"))
    self._emit_text("tr")
    self._emit(tokens.TagCloseSelfclose())
    self._context &= ~contexts.TABLE_CELL_OPEN

def _handle_table_cell(self):
pass

def _handle_header_cell(self):
pass

def _handle_cell_style(self):
pass

def _verify_safe(self, this): def _verify_safe(self, this):
"""Make sure we are not trying to write an invalid character.""" """Make sure we are not trying to write an invalid character."""
context = self._context context = self._context
@@ -1144,15 +1177,48 @@ class Tokenizer(object):
result = self._parse_style() result = self._parse_style()
if result is not None: if result is not None:
return result return result
elif self._read(-1) in ("\n", self.START):
if this in ("#", "*", ";", ":"):
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
self._handle_list() self._handle_list()
elif this == next == self._read(2) == self._read(3) == "-":
elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-":
self._handle_hr() self._handle_hr()
else:
self._emit_text(this)
elif this in ("\n", ":") and self._context & contexts.DL_TERM: elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term() self._handle_dl_term()

elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or
(self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")):
if self._can_recurse():
self._handle_table_start()
else:
self._emit_text("{|")
elif self._context & contexts.TABLE_OPEN:
if this == "|" and next == "}":
return self._handle_table_end()
elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE:
self._handle_table_cell()
elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE:
self._handle_header_cell()
elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE:
self._handle_header_cell()
elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE:
self._handle_cell_style()
# on newline, clear out cell line contexts
elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE):
self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE)
self._emit_text(this)
# newline or whitespace/newline
elif (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")):
if this == "|" and next == "-":
self._handle_table_row()
elif this == "|" and self._can_recurse():
self._handle_table_cell()
elif this == "!" and self._can_recurse():
self._handle_header_cell()
else:
self._emit_text(this)
else:
self._emit_text(this)

else: else:
self._emit_text(this) self._emit_text(this)
self._head += 1 self._head += 1


+ 32
- 0
tests/tokenizer/tables.mwtest Parādīt failu

@@ -0,0 +1,32 @@
name: empty_table
label: Parsing an empty table.
input: "{|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()]

---

name: inline_table
label: Correctly handle tables with close on the same line.
input: "{||}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()]

---

name: no_table_close_simple
label: Handle case when there is no table close.
input: "{| "
output: [Text(text="{| ")]

---

name: leading_whitespace_table
label: Handle leading whitespace for a table.
input: "foo \n \t {|\n|}"
output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()]

---

name: leading_characters_table
label: Don't parse as a table when leading characters are not newline or whitespace.
input: "foo \n foo \t {|\n|}"
output: [Text(text="foo \n foo \t {|\n|}")]

Notiek ielāde…
Atcelt
Saglabāt