@@ -5,7 +5,8 @@ python: | |||||
- 3.2 | - 3.2 | ||||
- 3.3 | - 3.3 | ||||
- 3.4 | - 3.4 | ||||
- nightly | |||||
- 3.5-dev | |||||
sudo: false | |||||
install: | install: | ||||
- pip install coveralls | - pip install coveralls | ||||
- python setup.py build | - python setup.py build | ||||
@@ -6,6 +6,11 @@ v0.4.1 (unreleased): | |||||
- Added support for Python 3.5. | - Added support for Python 3.5. | ||||
- '<' and '>' are now disallowed in wikilink titles and template names. This | - '<' and '>' are now disallowed in wikilink titles and template names. This | ||||
includes when denoting tags, but not comments. | includes when denoting tags, but not comments. | ||||
- Fixed the behavior of preserve_spacing in Template.add() and keep_field in | |||||
Template.remove() on parameters with hidden keys. | |||||
- Fixed parser bugs involving: | |||||
- templates with completely blank names; | |||||
- templates with newlines and comments. | |||||
- Heavy refactoring and fixes to the C tokenizer. | - Heavy refactoring and fixes to the C tokenizer. | ||||
- Fixed some bugs in the release scripts. | - Fixed some bugs in the release scripts. | ||||
@@ -139,7 +139,7 @@ If you're not using a library, you can parse any page using the following code | |||||
from urllib.parse import urlencode | from urllib.parse import urlencode | ||||
from urllib.request import urlopen | from urllib.request import urlopen | ||||
import mwparserfromhell | import mwparserfromhell | ||||
API_URL = "http://en.wikipedia.org/w/api.php" | |||||
API_URL = "https://en.wikipedia.org/w/api.php" | |||||
def parse(title): | def parse(title): | ||||
data = {"action": "query", "prop": "revisions", "rvlimit": 1, | data = {"action": "query", "prop": "revisions", "rvlimit": 1, | ||||
@@ -13,6 +13,13 @@ Unreleased | |||||
- Added support for Python 3.5. | - Added support for Python 3.5. | ||||
- ``<`` and ``>`` are now disallowed in wikilink titles and template names. | - ``<`` and ``>`` are now disallowed in wikilink titles and template names. | ||||
This includes when denoting tags, but not comments. | This includes when denoting tags, but not comments. | ||||
- Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and | |||||
*keep_field* in :func:`~.Template.remove` on parameters with hidden keys. | |||||
- Fixed parser bugs involving: | |||||
- templates with completely blank names; | |||||
- templates with newlines and comments. | |||||
- Heavy refactoring and fixes to the C tokenizer. | - Heavy refactoring and fixes to the C tokenizer. | ||||
- Fixed some bugs in the release scripts. | - Fixed some bugs in the release scripts. | ||||
@@ -25,7 +25,7 @@ If you're not using a library, you can parse any page using the following code | |||||
from urllib.parse import urlencode | from urllib.parse import urlencode | ||||
from urllib.request import urlopen | from urllib.request import urlopen | ||||
import mwparserfromhell | import mwparserfromhell | ||||
API_URL = "http://en.wikipedia.org/w/api.php" | |||||
API_URL = "https://en.wikipedia.org/w/api.php" | |||||
def parse(title): | def parse(title): | ||||
data = {"action": "query", "prop": "revisions", "rvlimit": 1, | data = {"action": "query", "prop": "revisions", "rvlimit": 1, | ||||
@@ -82,21 +82,11 @@ class Template(Node): | |||||
if char in node: | if char in node: | ||||
code.replace(node, node.replace(char, replacement), False) | code.replace(node, node.replace(char, replacement), False) | ||||
def _blank_param_value(self, value): | |||||
"""Remove the content from *value* while keeping its whitespace. | |||||
Replace *value*\ 's nodes with two text nodes, the first containing | |||||
whitespace from before its content and the second containing whitespace | |||||
from after its content. | |||||
""" | |||||
match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS) | |||||
value.nodes = [Text(match.group(1)), Text(match.group(2))] | |||||
def _select_theory(self, theories): | def _select_theory(self, theories): | ||||
"""Return the most likely spacing convention given different options. | """Return the most likely spacing convention given different options. | ||||
Given a dictionary of convention options as keys and their occurrence as | |||||
values, return the convention that occurs the most, or ``None`` if | |||||
Given a dictionary of convention options as keys and their occurrence | |||||
as values, return the convention that occurs the most, or ``None`` if | |||||
there is no clear preferred style. | there is no clear preferred style. | ||||
""" | """ | ||||
if theories: | if theories: | ||||
@@ -129,34 +119,47 @@ class Template(Node): | |||||
after = self._select_theory(after_theories) | after = self._select_theory(after_theories) | ||||
return before, after | return before, after | ||||
def _remove_with_field(self, param, i, name): | |||||
"""Return True if a parameter name should be kept, otherwise False.""" | |||||
if param.showkey: | |||||
following = self.params[i+1:] | |||||
better_matches = [after.name.strip() == name and not after.showkey for after in following] | |||||
if any(better_matches): | |||||
return False | |||||
return True | |||||
def _remove_without_field(self, param, i): | |||||
"""Return False if a parameter name should be kept, otherwise True.""" | |||||
if not param.showkey: | |||||
dependents = [not after.showkey for after in self.params[i+1:]] | |||||
if any(dependents): | |||||
return False | |||||
return True | |||||
def _blank_param_value(self, value):
    r"""Remove the content from *value* while keeping its whitespace.

    Replace *value*\ 's nodes with two text nodes, the first containing
    whitespace from before its content and the second containing whitespace
    from after its content.

    The docstring is a raw string so that the reST ``\ `` separator above
    is not read as an (invalid) Python escape sequence.
    """
    # Group 1 greedily takes the leading whitespace; the lazy ``.*?`` then
    # consumes as little as possible so that group 2 captures the whole
    # trailing whitespace run. For an all-whitespace value, group 1 takes
    # everything and group 2 is empty.
    match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS)
    value.nodes = [Text(match.group(1)), Text(match.group(2))]
def _fix_dependendent_params(self, i): | |||||
"""Unhide keys if necessary after removing the param at index *i*.""" | |||||
if not self.params[i].showkey: | |||||
for param in self.params[i + 1:]: | |||||
if not param.showkey: | |||||
param.showkey = True | |||||
def _remove_exact(self, needle, keep_field): | def _remove_exact(self, needle, keep_field): | ||||
"""Remove a specific parameter, *needle*, from the template.""" | """Remove a specific parameter, *needle*, from the template.""" | ||||
for i, param in enumerate(self.params): | for i, param in enumerate(self.params): | ||||
if param is needle: | if param is needle: | ||||
if keep_field or not self._remove_without_field(param, i): | |||||
if keep_field: | |||||
self._blank_param_value(param.value) | self._blank_param_value(param.value) | ||||
else: | else: | ||||
self._fix_dependendent_params(i) | |||||
self.params.pop(i) | self.params.pop(i) | ||||
return | return | ||||
raise ValueError(needle) | raise ValueError(needle) | ||||
def _should_remove(self, i, name): | |||||
"""Look ahead for a parameter with the same name, but hidden. | |||||
If one exists, we should remove the given one rather than blanking it. | |||||
""" | |||||
if self.params[i].showkey: | |||||
following = self.params[i + 1:] | |||||
better_matches = [after.name.strip() == name and not after.showkey | |||||
for after in following] | |||||
return any(better_matches) | |||||
return False | |||||
@property | @property | ||||
def name(self): | def name(self): | ||||
"""The name of the template, as a :class:`.Wikicode` object.""" | """The name of the template, as a :class:`.Wikicode` object.""" | ||||
@@ -213,26 +216,25 @@ class Template(Node): | |||||
:func:`.utils.parse_anything`; pipes and equal signs are automatically | :func:`.utils.parse_anything`; pipes and equal signs are automatically | ||||
escaped from *value* when appropriate. | escaped from *value* when appropriate. | ||||
If *name* is already a parameter in the template, we'll replace its | |||||
value. | |||||
If *showkey* is given, this will determine whether or not to show the | If *showkey* is given, this will determine whether or not to show the | ||||
parameter's name (e.g., ``{{foo|bar}}``'s parameter has a name of | parameter's name (e.g., ``{{foo|bar}}``'s parameter has a name of | ||||
``"1"`` but it is hidden); otherwise, we'll make a safe and intelligent | ``"1"`` but it is hidden); otherwise, we'll make a safe and intelligent | ||||
guess. | guess. | ||||
If *name* is already a parameter in the template, we'll replace its | |||||
value while keeping the same whitespace around it. We will also try to | |||||
guess the dominant spacing convention when adding a new parameter using | |||||
:meth:`_get_spacing_conventions`. | |||||
If *before* is given (either a :class:`.Parameter` object or a name), | If *before* is given (either a :class:`.Parameter` object or a name), | ||||
then we will place the parameter immediately before this one. | then we will place the parameter immediately before this one. | ||||
Otherwise, it will be added at the end. If *before* is a name and | Otherwise, it will be added at the end. If *before* is a name and | ||||
exists multiple times in the template, we will place it before the last | exists multiple times in the template, we will place it before the last | ||||
occurrence. If *before* is not in the template, :exc:`ValueError` is | occurrence. If *before* is not in the template, :exc:`ValueError` is | ||||
raised. The argument is ignored if the new parameter already exists. | |||||
raised. The argument is ignored if *name* is an existing parameter. | |||||
If *preserve_spacing* is ``False``, we will avoid preserving spacing | |||||
conventions when changing the value of an existing parameter or when | |||||
adding a new one. | |||||
If *preserve_spacing* is ``True``, we will try to preserve whitespace | |||||
conventions around the parameter, whether it is new or we are updating | |||||
an existing value. It is disabled for parameters with hidden keys, | |||||
since MediaWiki doesn't strip whitespace in this case. | |||||
""" | """ | ||||
name, value = parse_anything(name), parse_anything(value) | name, value = parse_anything(name), parse_anything(value) | ||||
self._surface_escape(value, "|") | self._surface_escape(value, "|") | ||||
@@ -245,7 +247,7 @@ class Template(Node): | |||||
if not existing.showkey: | if not existing.showkey: | ||||
self._surface_escape(value, "=") | self._surface_escape(value, "=") | ||||
nodes = existing.value.nodes | nodes = existing.value.nodes | ||||
if preserve_spacing: | |||||
if preserve_spacing and existing.showkey: | |||||
for i in range(2): # Ignore empty text nodes | for i in range(2): # Ignore empty text nodes | ||||
if not nodes[i]: | if not nodes[i]: | ||||
nodes[i] = None | nodes[i] = None | ||||
@@ -271,7 +273,7 @@ class Template(Node): | |||||
if not showkey: | if not showkey: | ||||
self._surface_escape(value, "=") | self._surface_escape(value, "=") | ||||
if preserve_spacing: | |||||
if preserve_spacing and showkey: | |||||
before_n, after_n = self._get_spacing_conventions(use_names=True) | before_n, after_n = self._get_spacing_conventions(use_names=True) | ||||
before_v, after_v = self._get_spacing_conventions(use_names=False) | before_v, after_v = self._get_spacing_conventions(use_names=False) | ||||
name = parse_anything([before_n, name, after_n]) | name = parse_anything([before_n, name, after_n]) | ||||
@@ -294,36 +296,39 @@ class Template(Node): | |||||
and :meth:`get`. | and :meth:`get`. | ||||
If *keep_field* is ``True``, we will keep the parameter's name, but | If *keep_field* is ``True``, we will keep the parameter's name, but | ||||
blank its value. Otherwise, we will remove the parameter completely | |||||
*unless* other parameters are dependent on it (e.g. removing ``bar`` | |||||
from ``{{foo|bar|baz}}`` is unsafe because ``{{foo|baz}}`` is not what | |||||
we expected, so ``{{foo||baz}}`` will be produced instead). | |||||
blank its value. Otherwise, we will remove the parameter completely. | |||||
When removing a parameter with a hidden name, subsequent parameters | |||||
with hidden names will be made visible. For example, removing ``bar`` | |||||
from ``{{foo|bar|baz}}`` produces ``{{foo|2=baz}}`` because | |||||
``{{foo|baz}}`` is incorrect. | |||||
If the parameter shows up multiple times in the template and *param* is | If the parameter shows up multiple times in the template and *param* is | ||||
not a :class:`.Parameter` object, we will remove all instances of it | not a :class:`.Parameter` object, we will remove all instances of it | ||||
(and keep only one if *keep_field* is ``True`` - the first instance if | |||||
none have dependents, otherwise the one with dependents will be kept). | |||||
(and keep only one if *keep_field* is ``True`` - either the one with a | |||||
hidden name, if it exists, or the first instance). | |||||
""" | """ | ||||
if isinstance(param, Parameter): | if isinstance(param, Parameter): | ||||
return self._remove_exact(param, keep_field) | return self._remove_exact(param, keep_field) | ||||
name = str(param).strip() | name = str(param).strip() | ||||
removed = False | removed = False | ||||
to_remove = [] | to_remove = [] | ||||
for i, param in enumerate(self.params): | for i, param in enumerate(self.params): | ||||
if param.name.strip() == name: | if param.name.strip() == name: | ||||
if keep_field: | if keep_field: | ||||
if self._remove_with_field(param, i, name): | |||||
self._blank_param_value(param.value) | |||||
keep_field = False | |||||
else: | |||||
to_remove.append(i) | |||||
else: | |||||
if self._remove_without_field(param, i): | |||||
if self._should_remove(i, name): | |||||
to_remove.append(i) | to_remove.append(i) | ||||
else: | else: | ||||
self._blank_param_value(param.value) | self._blank_param_value(param.value) | ||||
keep_field = False | |||||
else: | |||||
self._fix_dependendent_params(i) | |||||
to_remove.append(i) | |||||
if not removed: | if not removed: | ||||
removed = True | removed = True | ||||
if not removed: | if not removed: | ||||
raise ValueError(name) | raise ValueError(name) | ||||
for i in reversed(to_remove): | for i in reversed(to_remove): | ||||
@@ -40,11 +40,11 @@ class ParserError(Exception): | |||||
from .builder import Builder | from .builder import Builder | ||||
from .tokenizer import Tokenizer | |||||
try: | try: | ||||
from ._tokenizer import CTokenizer | from ._tokenizer import CTokenizer | ||||
use_c = True | use_c = True | ||||
except ImportError: | except ImportError: | ||||
from .tokenizer import Tokenizer | |||||
CTokenizer = None | CTokenizer = None | ||||
use_c = False | use_c = False | ||||
@@ -70,6 +70,7 @@ class Parser(object): | |||||
if use_c and CTokenizer: | if use_c and CTokenizer: | ||||
self._tokenizer = CTokenizer() | self._tokenizer = CTokenizer() | ||||
else: | else: | ||||
from .tokenizer import Tokenizer | |||||
self._tokenizer = Tokenizer() | self._tokenizer = Tokenizer() | ||||
self._builder = Builder() | self._builder = Builder() | ||||
@@ -89,6 +89,7 @@ Local (stack-specific) contexts: | |||||
* :const:`FAIL_ON_LBRACE` | * :const:`FAIL_ON_LBRACE` | ||||
* :const:`FAIL_ON_RBRACE` | * :const:`FAIL_ON_RBRACE` | ||||
* :const:`FAIL_ON_EQUALS` | * :const:`FAIL_ON_EQUALS` | ||||
* :const:`HAS_TEMPLATE` | |||||
* :const:`TABLE` | * :const:`TABLE` | ||||
@@ -161,15 +162,16 @@ FAIL_NEXT = 1 << 26 | |||||
FAIL_ON_LBRACE = 1 << 27 | FAIL_ON_LBRACE = 1 << 27 | ||||
FAIL_ON_RBRACE = 1 << 28 | FAIL_ON_RBRACE = 1 << 28 | ||||
FAIL_ON_EQUALS = 1 << 29 | FAIL_ON_EQUALS = 1 << 29 | ||||
HAS_TEMPLATE = 1 << 30 | |||||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | ||||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | |||||
TABLE_OPEN = 1 << 30 | |||||
TABLE_CELL_OPEN = 1 << 31 | |||||
TABLE_CELL_STYLE = 1 << 32 | |||||
TABLE_ROW_OPEN = 1 << 33 | |||||
TABLE_TD_LINE = 1 << 34 | |||||
TABLE_TH_LINE = 1 << 35 | |||||
FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE) | |||||
TABLE_OPEN = 1 << 31 | |||||
TABLE_CELL_OPEN = 1 << 32 | |||||
TABLE_CELL_STYLE = 1 << 33 | |||||
TABLE_ROW_OPEN = 1 << 34 | |||||
TABLE_TD_LINE = 1 << 35 | |||||
TABLE_TH_LINE = 1 << 36 | |||||
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE | TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE | ||||
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + | TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + | ||||
TABLE_TD_LINE + TABLE_TH_LINE) | TABLE_TD_LINE + TABLE_TH_LINE) | ||||
@@ -63,22 +63,23 @@ SOFTWARE. | |||||
#define LC_DLTERM 0x0000000000800000 | #define LC_DLTERM 0x0000000000800000 | ||||
#define LC_SAFETY_CHECK 0x000000003F000000 | |||||
#define LC_SAFETY_CHECK 0x000000007F000000 | |||||
#define LC_HAS_TEXT 0x0000000001000000 | #define LC_HAS_TEXT 0x0000000001000000 | ||||
#define LC_FAIL_ON_TEXT 0x0000000002000000 | #define LC_FAIL_ON_TEXT 0x0000000002000000 | ||||
#define LC_FAIL_NEXT 0x0000000004000000 | #define LC_FAIL_NEXT 0x0000000004000000 | ||||
#define LC_FAIL_ON_LBRACE 0x0000000008000000 | #define LC_FAIL_ON_LBRACE 0x0000000008000000 | ||||
#define LC_FAIL_ON_RBRACE 0x0000000010000000 | #define LC_FAIL_ON_RBRACE 0x0000000010000000 | ||||
#define LC_FAIL_ON_EQUALS 0x0000000020000000 | #define LC_FAIL_ON_EQUALS 0x0000000020000000 | ||||
#define LC_TABLE 0x0000000FC0000000 | |||||
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 | |||||
#define LC_TABLE_OPEN 0x0000000040000000 | |||||
#define LC_TABLE_CELL_OPEN 0x0000000080000000 | |||||
#define LC_TABLE_CELL_STYLE 0x0000000100000000 | |||||
#define LC_TABLE_ROW_OPEN 0x0000000200000000 | |||||
#define LC_TABLE_TD_LINE 0x0000000400000000 | |||||
#define LC_TABLE_TH_LINE 0x0000000800000000 | |||||
#define LC_HAS_TEMPLATE 0x0000000040000000 | |||||
#define LC_TABLE 0x0000001F80000000 | |||||
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000 | |||||
#define LC_TABLE_OPEN 0x0000000080000000 | |||||
#define LC_TABLE_CELL_OPEN 0x0000000100000000 | |||||
#define LC_TABLE_CELL_STYLE 0x0000000200000000 | |||||
#define LC_TABLE_ROW_OPEN 0x0000000400000000 | |||||
#define LC_TABLE_TD_LINE 0x0000000800000000 | |||||
#define LC_TABLE_TH_LINE 0x0000001000000000 | |||||
/* Global contexts */ | /* Global contexts */ | ||||
@@ -121,12 +121,16 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) | |||||
/* | /* | ||||
Parse a template at the head of the wikicode string. | Parse a template at the head of the wikicode string. | ||||
*/ | */ | ||||
static int Tokenizer_parse_template(Tokenizer* self) | |||||
static int Tokenizer_parse_template(Tokenizer* self, int has_content) | |||||
{ | { | ||||
PyObject *template; | PyObject *template; | ||||
Py_ssize_t reset = self->head; | Py_ssize_t reset = self->head; | ||||
uint64_t context = LC_TEMPLATE_NAME; | |||||
template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); | |||||
if (has_content) | |||||
context |= LC_HAS_TEMPLATE; | |||||
template = Tokenizer_parse(self, context, 1); | |||||
if (BAD_ROUTE) { | if (BAD_ROUTE) { | ||||
self->head = reset; | self->head = reset; | ||||
return 0; | return 0; | ||||
@@ -182,6 +186,7 @@ static int Tokenizer_parse_argument(Tokenizer* self) | |||||
static int Tokenizer_parse_template_or_argument(Tokenizer* self) | static int Tokenizer_parse_template_or_argument(Tokenizer* self) | ||||
{ | { | ||||
unsigned int braces = 2, i; | unsigned int braces = 2, i; | ||||
int has_content = 0; | |||||
PyObject *tokenlist; | PyObject *tokenlist; | ||||
self->head += 2; | self->head += 2; | ||||
@@ -198,7 +203,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) | |||||
return 0; | return 0; | ||||
} | } | ||||
if (braces == 2) { | if (braces == 2) { | ||||
if (Tokenizer_parse_template(self)) | |||||
if (Tokenizer_parse_template(self, has_content)) | |||||
return -1; | return -1; | ||||
if (BAD_ROUTE) { | if (BAD_ROUTE) { | ||||
RESET_ROUTE(); | RESET_ROUTE(); | ||||
@@ -212,7 +217,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) | |||||
return -1; | return -1; | ||||
if (BAD_ROUTE) { | if (BAD_ROUTE) { | ||||
RESET_ROUTE(); | RESET_ROUTE(); | ||||
if (Tokenizer_parse_template(self)) | |||||
if (Tokenizer_parse_template(self, has_content)) | |||||
return -1; | return -1; | ||||
if (BAD_ROUTE) { | if (BAD_ROUTE) { | ||||
char text[MAX_BRACES + 1]; | char text[MAX_BRACES + 1]; | ||||
@@ -228,8 +233,10 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) | |||||
} | } | ||||
else | else | ||||
braces -= 3; | braces -= 3; | ||||
if (braces) | |||||
if (braces) { | |||||
has_content = 1; | |||||
self->head++; | self->head++; | ||||
} | |||||
} | } | ||||
tokenlist = Tokenizer_pop(self); | tokenlist = Tokenizer_pop(self); | ||||
if (!tokenlist) | if (!tokenlist) | ||||
@@ -251,8 +258,13 @@ static int Tokenizer_handle_template_param(Tokenizer* self) | |||||
{ | { | ||||
PyObject *stack; | PyObject *stack; | ||||
if (self->topstack->context & LC_TEMPLATE_NAME) | |||||
if (self->topstack->context & LC_TEMPLATE_NAME) { | |||||
if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) { | |||||
Tokenizer_fail_route(self); | |||||
return -1; | |||||
} | |||||
self->topstack->context ^= LC_TEMPLATE_NAME; | self->topstack->context ^= LC_TEMPLATE_NAME; | ||||
} | |||||
else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) | else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) | ||||
self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; | self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; | ||||
if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { | if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { | ||||
@@ -303,7 +315,11 @@ static PyObject* Tokenizer_handle_template_end(Tokenizer* self) | |||||
{ | { | ||||
PyObject* stack; | PyObject* stack; | ||||
if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { | |||||
if (self->topstack->context & LC_TEMPLATE_NAME) { | |||||
if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) | |||||
return Tokenizer_fail_route(self); | |||||
} | |||||
else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { | |||||
stack = Tokenizer_pop_keeping_context(self); | stack = Tokenizer_pop_keeping_context(self); | ||||
if (!stack) | if (!stack) | ||||
return NULL; | return NULL; | ||||
@@ -2428,30 +2444,26 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) | |||||
if (context & LC_TAG_CLOSE) | if (context & LC_TAG_CLOSE) | ||||
return (data == '<') ? -1 : 0; | return (data == '<') ? -1 : 0; | ||||
if (context & LC_TEMPLATE_NAME) { | if (context & LC_TEMPLATE_NAME) { | ||||
if (data == '{' || data == '}' || data == '[') { | |||||
if (data == '{') { | |||||
self->topstack->context |= LC_HAS_TEMPLATE | LC_FAIL_NEXT; | |||||
return 0; | |||||
} | |||||
if (data == '}' || (data == '<' && Tokenizer_READ(self, 1) == '!')) { | |||||
self->topstack->context |= LC_FAIL_NEXT; | self->topstack->context |= LC_FAIL_NEXT; | ||||
return 0; | return 0; | ||||
} | } | ||||
if (data == ']' || data == '>' || (data == '<' && | |||||
Tokenizer_READ(self, 1) != '!')) { | |||||
if (data == '[' || data == ']' || data == '<' || data == '>') { | |||||
return -1; | return -1; | ||||
} | } | ||||
if (data == '|') | if (data == '|') | ||||
return 0; | return 0; | ||||
if (context & LC_HAS_TEXT) { | if (context & LC_HAS_TEXT) { | ||||
if (context & LC_FAIL_ON_TEXT) { | if (context & LC_FAIL_ON_TEXT) { | ||||
if (!Py_UNICODE_ISSPACE(data)) { | |||||
if (data == '<' && Tokenizer_READ(self, 1) == '!') { | |||||
self->topstack->context |= LC_FAIL_NEXT; | |||||
return 0; | |||||
} | |||||
if (!Py_UNICODE_ISSPACE(data)) | |||||
return -1; | return -1; | ||||
} | |||||
} | |||||
else { | |||||
if (data == '\n') | |||||
self->topstack->context |= LC_FAIL_ON_TEXT; | |||||
} | } | ||||
else if (data == '\n') | |||||
self->topstack->context |= LC_FAIL_ON_TEXT; | |||||
} | } | ||||
else if (!Py_UNICODE_ISSPACE(data)) | else if (!Py_UNICODE_ISSPACE(data)) | ||||
self->topstack->context |= LC_HAS_TEXT; | self->topstack->context |= LC_HAS_TEXT; | ||||
@@ -192,11 +192,14 @@ class Tokenizer(object): | |||||
self._fail_route() | self._fail_route() | ||||
return self.END | return self.END | ||||
def _parse_template(self): | |||||
def _parse_template(self, has_content): | |||||
"""Parse a template at the head of the wikicode string.""" | """Parse a template at the head of the wikicode string.""" | ||||
reset = self._head | reset = self._head | ||||
context = contexts.TEMPLATE_NAME | |||||
if has_content: | |||||
context |= contexts.HAS_TEMPLATE | |||||
try: | try: | ||||
template = self._parse(contexts.TEMPLATE_NAME) | |||||
template = self._parse(context) | |||||
except BadRoute: | except BadRoute: | ||||
self._head = reset | self._head = reset | ||||
raise | raise | ||||
@@ -223,6 +226,7 @@ class Tokenizer(object): | |||||
while self._read() == "{": | while self._read() == "{": | ||||
self._head += 1 | self._head += 1 | ||||
braces += 1 | braces += 1 | ||||
has_content = False | |||||
self._push() | self._push() | ||||
while braces: | while braces: | ||||
@@ -230,7 +234,7 @@ class Tokenizer(object): | |||||
return self._emit_text_then_stack("{") | return self._emit_text_then_stack("{") | ||||
if braces == 2: | if braces == 2: | ||||
try: | try: | ||||
self._parse_template() | |||||
self._parse_template(has_content) | |||||
except BadRoute: | except BadRoute: | ||||
return self._emit_text_then_stack("{{") | return self._emit_text_then_stack("{{") | ||||
break | break | ||||
@@ -239,11 +243,12 @@ class Tokenizer(object): | |||||
braces -= 3 | braces -= 3 | ||||
except BadRoute: | except BadRoute: | ||||
try: | try: | ||||
self._parse_template() | |||||
self._parse_template(has_content) | |||||
braces -= 2 | braces -= 2 | ||||
except BadRoute: | except BadRoute: | ||||
return self._emit_text_then_stack("{" * braces) | return self._emit_text_then_stack("{" * braces) | ||||
if braces: | if braces: | ||||
has_content = True | |||||
self._head += 1 | self._head += 1 | ||||
self._emit_all(self._pop()) | self._emit_all(self._pop()) | ||||
@@ -253,6 +258,8 @@ class Tokenizer(object): | |||||
def _handle_template_param(self): | def _handle_template_param(self): | ||||
"""Handle a template parameter at the head of the string.""" | """Handle a template parameter at the head of the string.""" | ||||
if self._context & contexts.TEMPLATE_NAME: | if self._context & contexts.TEMPLATE_NAME: | ||||
if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): | |||||
self._fail_route() | |||||
self._context ^= contexts.TEMPLATE_NAME | self._context ^= contexts.TEMPLATE_NAME | ||||
elif self._context & contexts.TEMPLATE_PARAM_VALUE: | elif self._context & contexts.TEMPLATE_PARAM_VALUE: | ||||
self._context ^= contexts.TEMPLATE_PARAM_VALUE | self._context ^= contexts.TEMPLATE_PARAM_VALUE | ||||
@@ -271,7 +278,10 @@ class Tokenizer(object): | |||||
def _handle_template_end(self): | def _handle_template_end(self): | ||||
"""Handle the end of a template at the head of the string.""" | """Handle the end of a template at the head of the string.""" | ||||
if self._context & contexts.TEMPLATE_PARAM_KEY: | |||||
if self._context & contexts.TEMPLATE_NAME: | |||||
if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): | |||||
self._fail_route() | |||||
elif self._context & contexts.TEMPLATE_PARAM_KEY: | |||||
self._emit_all(self._pop(keep_context=True)) | self._emit_all(self._pop(keep_context=True)) | ||||
self._head += 1 | self._head += 1 | ||||
return self._pop() | return self._pop() | ||||
@@ -1183,23 +1193,22 @@ class Tokenizer(object): | |||||
elif context & contexts.EXT_LINK_TITLE: | elif context & contexts.EXT_LINK_TITLE: | ||||
return this != "\n" | return this != "\n" | ||||
elif context & contexts.TEMPLATE_NAME: | elif context & contexts.TEMPLATE_NAME: | ||||
if this == "{" or this == "}" or this == "[": | |||||
if this == "{": | |||||
self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT | |||||
return True | |||||
if this == "}" or (this == "<" and self._read(1) == "!"): | |||||
self._context |= contexts.FAIL_NEXT | self._context |= contexts.FAIL_NEXT | ||||
return True | return True | ||||
if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"): | |||||
if this == "[" or this == "]" or this == "<" or this == ">": | |||||
return False | return False | ||||
if this == "|": | if this == "|": | ||||
return True | return True | ||||
if context & contexts.HAS_TEXT: | if context & contexts.HAS_TEXT: | ||||
if context & contexts.FAIL_ON_TEXT: | if context & contexts.FAIL_ON_TEXT: | ||||
if this is self.END or not this.isspace(): | if this is self.END or not this.isspace(): | ||||
if this == "<" and self._read(1) == "!": | |||||
self._context |= contexts.FAIL_NEXT | |||||
return True | |||||
return False | return False | ||||
else: | |||||
if this == "\n": | |||||
self._context |= contexts.FAIL_ON_TEXT | |||||
elif this == "\n": | |||||
self._context |= contexts.FAIL_ON_TEXT | |||||
elif this is self.END or not this.isspace(): | elif this is self.END or not this.isspace(): | ||||
self._context |= contexts.HAS_TEXT | self._context |= contexts.HAS_TEXT | ||||
return True | return True | ||||
@@ -115,8 +115,8 @@ class TestDocs(unittest.TestCase): | |||||
@unittest.skipIf("NOWEB" in os.environ, "web test disabled by environ var") | @unittest.skipIf("NOWEB" in os.environ, "web test disabled by environ var") | ||||
def test_readme_5(self): | def test_readme_5(self): | ||||
"""test a block of example code in the README; includes a web call""" | """test a block of example code in the README; includes a web call""" | ||||
url1 = "http://en.wikipedia.org/w/api.php" | |||||
url2 = "http://en.wikipedia.org/w/index.php?title={0}&action=raw" | |||||
url1 = "https://en.wikipedia.org/w/api.php" | |||||
url2 = "https://en.wikipedia.org/w/index.php?title={0}&action=raw" | |||||
title = "Test" | title = "Test" | ||||
data = {"action": "query", "prop": "revisions", "rvlimit": 1, | data = {"action": "query", "prop": "revisions", "rvlimit": 1, | ||||
"rvprop": "content", "format": "json", "titles": title} | "rvprop": "content", "format": "json", "titles": title} | ||||
@@ -213,6 +213,9 @@ class TestTemplate(TreeEqualityTestCase): | |||||
pgens("f", "g")]) | pgens("f", "g")]) | ||||
node37 = Template(wraptext("a"), [pgenh("1", "")]) | node37 = Template(wraptext("a"), [pgenh("1", "")]) | ||||
node38 = Template(wraptext("abc")) | node38 = Template(wraptext("abc")) | ||||
node39 = Template(wraptext("a"), [pgenh("1", " b ")]) | |||||
node40 = Template(wraptext("a"), [pgenh("1", " b"), pgenh("2", " c")]) | |||||
node41 = Template(wraptext("a"), [pgens("1", " b"), pgens("2", " c")]) | |||||
node1.add("e", "f", showkey=True) | node1.add("e", "f", showkey=True) | ||||
node2.add(2, "g", showkey=False) | node2.add(2, "g", showkey=False) | ||||
@@ -255,6 +258,9 @@ class TestTemplate(TreeEqualityTestCase): | |||||
node37.add(1, "b") | node37.add(1, "b") | ||||
node38.add("1", "foo") | node38.add("1", "foo") | ||||
self.assertRaises(ValueError, node38.add, "z", "bar", showkey=False) | self.assertRaises(ValueError, node38.add, "z", "bar", showkey=False) | ||||
node39.add("1", "c") | |||||
node40.add("3", "d") | |||||
node41.add("3", "d") | |||||
self.assertEqual("{{a|b=c|d|e=f}}", node1) | self.assertEqual("{{a|b=c|d|e=f}}", node1) | ||||
self.assertEqual("{{a|b=c|d|g}}", node2) | self.assertEqual("{{a|b=c|d|g}}", node2) | ||||
@@ -299,6 +305,9 @@ class TestTemplate(TreeEqualityTestCase): | |||||
self.assertEqual("{{a|b=c|d=h|f=g}}", node36) | self.assertEqual("{{a|b=c|d=h|f=g}}", node36) | ||||
self.assertEqual("{{a|b}}", node37) | self.assertEqual("{{a|b}}", node37) | ||||
self.assertEqual("{{abc|foo}}", node38) | self.assertEqual("{{abc|foo}}", node38) | ||||
self.assertEqual("{{a|c}}", node39) | |||||
self.assertEqual("{{a| b| c|d}}", node40) | |||||
self.assertEqual("{{a|1= b|2= c|3= d}}", node41) | |||||
def test_remove(self): | def test_remove(self): | ||||
"""test Template.remove()""" | """test Template.remove()""" | ||||
@@ -395,13 +404,13 @@ class TestTemplate(TreeEqualityTestCase): | |||||
self.assertRaises(ValueError, node2.remove, "1") | self.assertRaises(ValueError, node2.remove, "1") | ||||
self.assertEqual("{{foo}}", node2) | self.assertEqual("{{foo}}", node2) | ||||
self.assertEqual("{{foo||abc=}}", node3) | self.assertEqual("{{foo||abc=}}", node3) | ||||
self.assertEqual("{{foo||baz}}", node4) | |||||
self.assertEqual("{{foo|2=baz}}", node4) | |||||
self.assertEqual("{{foo|b=c}}", node5) | self.assertEqual("{{foo|b=c}}", node5) | ||||
self.assertEqual("{{foo| a=|b=c}}", node6) | self.assertEqual("{{foo| a=|b=c}}", node6) | ||||
self.assertEqual("{{foo|1 =|2=c}}", node7) | self.assertEqual("{{foo|1 =|2=c}}", node7) | ||||
self.assertEqual("{{foo|2=c}}", node8) | self.assertEqual("{{foo|2=c}}", node8) | ||||
self.assertEqual("{{foo||c}}", node9) | self.assertEqual("{{foo||c}}", node9) | ||||
self.assertEqual("{{foo||c}}", node10) | |||||
self.assertEqual("{{foo|2=c}}", node10) | |||||
self.assertEqual("{{foo|b=c|a =d}}", node11) | self.assertEqual("{{foo|b=c|a =d}}", node11) | ||||
self.assertEqual("{{foo| a=|b=c|a =d}}", node12) | self.assertEqual("{{foo| a=|b=c|a =d}}", node12) | ||||
self.assertEqual("{{foo| a=b|a =d}}", node13) | self.assertEqual("{{foo| a=b|a =d}}", node13) | ||||
@@ -410,7 +419,7 @@ class TestTemplate(TreeEqualityTestCase): | |||||
self.assertEqual("{{foo| a=b|b=c|a =}}", node16) | self.assertEqual("{{foo| a=b|b=c|a =}}", node16) | ||||
self.assertEqual("{{foo|b|c}}", node17) | self.assertEqual("{{foo|b|c}}", node17) | ||||
self.assertEqual("{{foo|1 =|b|c}}", node18) | self.assertEqual("{{foo|1 =|b|c}}", node18) | ||||
self.assertEqual("{{foo|1 =a||c}}", node19) | |||||
self.assertEqual("{{foo|1 =a|2=c}}", node19) | |||||
self.assertEqual("{{foo|1 =a||c}}", node20) | self.assertEqual("{{foo|1 =a||c}}", node20) | ||||
self.assertEqual("{{foo|c=d|e=f}}", node21) | self.assertEqual("{{foo|c=d|e=f}}", node21) | ||||
self.assertEqual("{{foo|a=|c=d|e=f}}", node22) | self.assertEqual("{{foo|a=|c=d|e=f}}", node22) | ||||
@@ -244,6 +244,13 @@ output: [Text(text="{{foobar\n<!|key=value}}")] | |||||
--- | --- | ||||
name: newline_and_comment_in_template_name_8 | |||||
label: a template name containing a newline followed by a comment | |||||
input: "{{<!-- comment -->\nfoobar\n<!-- comment -->}}" | |||||
output: [TemplateOpen(), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="\nfoobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), TemplateClose()] | |||||
--- | |||||
name: tag_in_link_title | name: tag_in_link_title | ||||
label: HTML tags are invalid in link titles, even when complete | label: HTML tags are invalid in link titles, even when complete | ||||
input: "[[foo<i>bar</i>baz]]" | input: "[[foo<i>bar</i>baz]]" | ||||
@@ -1,17 +1,3 @@ | |||||
name: blank | |||||
label: template with no content | |||||
input: "{{}}" | |||||
output: [TemplateOpen(), TemplateClose()] | |||||
name: blank_with_params | |||||
label: template with no content, but pipes and equal signs | |||||
input: "{{||=|}}" | |||||
output: [TemplateOpen(), TemplateParamSeparator(), TemplateParamSeparator(), TemplateParamEquals(), TemplateParamSeparator(), TemplateClose()] | |||||
name: no_params | name: no_params | ||||
label: simplest type of template | label: simplest type of template | ||||
input: "{{template}}" | input: "{{template}}" | ||||
@@ -61,6 +47,13 @@ output: [TemplateOpen(), Text(text="foo"), TemplateParamSeparator(), Text(text=" | |||||
--- | --- | ||||
name: blank_params | |||||
label: template with blank parameters (mix of pipes and equal signs) | |||||
input: "{{,||=|}}" | |||||
output: [TemplateOpen(), Text(text=","), TemplateParamSeparator(), TemplateParamSeparator(), TemplateParamEquals(), TemplateParamSeparator(), TemplateClose()] | |||||
--- | |||||
name: nested_unnamed_param | name: nested_unnamed_param | ||||
label: nested template as an unnamed parameter | label: nested template as an unnamed parameter | ||||
input: "{{foo|{{bar}}}}" | input: "{{foo|{{bar}}}}" | ||||
@@ -390,6 +383,34 @@ output: [TemplateOpen(), Text(text="foo\n "), TemplateParamSeparator(), Text(te | |||||
--- | --- | ||||
name: invalid_blank | |||||
label: invalid template with no content | |||||
input: "{{}}" | |||||
output: [Text(text="{{}}")] | |||||
--- | |||||
name: invalid_blank_whitespace | |||||
label: invalid template with no content, but whitespace | |||||
input: "{{ }}" | |||||
output: [Text(text="{{ }}")] | |||||
--- | |||||
name: invalid_blank_pipe | |||||
label: invalid template with no content, but a parameter | |||||
input: "{{|foo}}" | |||||
output: [Text(text="{{|foo}}")] | |||||
--- | |||||
name: invalid_blank_whitespace_pipe | |||||
label: invalid template with no content, but whitespace and a parameter | |||||
input: "{{ |foo}}" | |||||
output: [Text(text="{{ |foo}}")] | |||||
--- | |||||
name: invalid_name_left_brace_middle | name: invalid_name_left_brace_middle | ||||
label: invalid characters in template name: left brace in middle | label: invalid characters in template name: left brace in middle | ||||
input: "{{foo{bar}}" | input: "{{foo{bar}}" | ||||
@@ -665,5 +686,5 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ | |||||
name: recursion_opens_and_closes | name: recursion_opens_and_closes | ||||
label: test potentially dangerous recursion: template openings and closings | label: test potentially dangerous recursion: template openings and closings | ||||
input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}" | |||||
output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")] | |||||
input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" | |||||
output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")] |