Browse Source

Port the fix for #197 to the C tokenizer

pull/232/head
Jakub Klinkovský 5 years ago
parent
commit
df3a9054cb
1 changed files with 67 additions and 1 deletions
  1. +67
    -1
      mwparserfromhell/parser/ctokenizer/tok_parse.c

+ 67
- 1
mwparserfromhell/parser/ctokenizer/tok_parse.c View File

@@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
}

/*
Check if the given character is a non-word character.

Equivalent to this Python code:

def is_non_word_character(ch):
if re.fullmatch(r"\W", chunk):
return True
return False
*/
static int is_non_word_character(Py_UCS4 ch)
{
int ret = 0;
PyObject* modname = NULL;
PyObject* module = NULL;
PyObject* fmatch = NULL;
PyObject* pattern = NULL;
PyObject* str = NULL;
PyObject* posArgs = NULL;
PyObject* match = NULL;

modname = PyUnicode_FromString("re");
if (modname == NULL)
goto error;
module = PyImport_Import(modname);
if (module == NULL)
goto error;
fmatch = PyObject_GetAttrString(module, "fullmatch");
if (fmatch == NULL)
goto error;
pattern = PyUnicode_FromString("\\W");
if (pattern == NULL)
goto error;
str = PyUnicode_FROM_SINGLE(ch);
if (str == NULL)
goto error;
posArgs = PyTuple_Pack(2, pattern, str);
if (posArgs == NULL)
goto error;
match = PyObject_Call(fmatch, posArgs, NULL);
if (match == NULL)
goto error;

if (match != Py_None)
ret = 1;
goto end;

error:
ret = -1;
end:
Py_XDECREF(match);
Py_XDECREF(posArgs);
Py_XDECREF(str);
Py_XDECREF(pattern);
Py_XDECREF(fmatch);
Py_XDECREF(module);
Py_XDECREF(modname);
return ret;
}

/*
Parse a template at the head of the wikicode string.
*/
static int Tokenizer_parse_template(Tokenizer* self, int has_content)
@@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
// it was just parsed as text:
for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
chunk = Textbuffer_read(self->topstack->textbuffer, i);
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
// stop at the first non-word character
int is_non_word = is_non_word_character(chunk);
if (is_non_word < 0) {
Textbuffer_dealloc(scheme_buffer);
return -1;
}
else if (is_non_word == 1)
goto end_of_loop;
j = 0;
do {


Loading…
Cancel
Save