
A bit of misc cleanup.

tags/v0.2
Ben Kurtovic, 11 years ago
commit 054a84afe0

2 changed files with 26 additions and 18 deletions:

  1. mwparserfromhell/parser/tokenizer.c (+22, -18)
  2. mwparserfromhell/parser/tokenizer.h (+4, -0)

mwparserfromhell/parser/tokenizer.c (+22, -18)

@@ -843,7 +843,8 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         self->head++;
     }
     current = log2(self->topstack->context / LC_HEADING_LEVEL_1) + 1;
-    level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
+    level = current > best ? (best > 6 ? 6 : best) :
+            (current > 6 ? 6 : current);
     after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
     if (BAD_ROUTE) {
         RESET_ROUTE();
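The rewrapped ternary is a clamped minimum: the heading level becomes the smaller of `current` and `best`, capped at 6, the deepest heading level MediaWiki supports. A standalone sketch of the same arithmetic, with hypothetical run lengths:

    #include <stdio.h>

    /* Demo of the level-clamping expression above: take the smaller of
       `current` and `best`, never exceeding 6. Values are made up. */
    int main(void)
    {
        int current = 8, best = 3;  /* assumed sample '=' run lengths */
        int level = current > best ? (best > 6 ? 6 : best)
                                   : (current > 6 ? 6 : current);
        printf("level = %d\n", level);  /* prints 3 */
        return 0;
    }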
@@ -956,11 +957,11 @@ Tokenizer_really_parse_entity(Tokenizer* self)
     else
         numeric = hexadecimal = 0;
     if (hexadecimal)
-        valid = "0123456789abcdefABCDEF";
+        valid = HEXDIGITS;
     else if (numeric)
-        valid = "0123456789";
+        valid = DIGITS;
     else
-        valid = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+        valid = ALPHANUM;
     text = calloc(MAX_ENTITY_SIZE, sizeof(char));
     if (!text) {
         PyErr_NoMemory();
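A `valid` string such as HEXDIGITS acts as a simple membership table: a candidate character is accepted only if strchr() finds it in the set. A minimal sketch of that pattern, not the tokenizer's exact loop:

    #include <stdio.h>
    #include <string.h>

    #define HEXDIGITS "0123456789abcdefABCDEF"

    /* Return 1 if every character of `text` appears in `valid`, else 0. */
    static int all_in_charset(const char* text, const char* valid)
    {
        for (; *text; text++) {
            if (!strchr(valid, *text))
                return 0;
        }
        return 1;
    }

    int main(void)
    {
        printf("%d\n", all_in_charset("1f4a9", HEXDIGITS));  /* 1 */
        printf("%d\n", all_in_charset("xyz", HEXDIGITS));    /* 0 */
        return 0;
    }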
@@ -1005,7 +1006,7 @@ Tokenizer_really_parse_entity(Tokenizer* self)
     i = 0;
     while (1) {
         def = entitydefs[i];
-        if (!def) // We've reached the end of the def list without finding it
+        if (!def) // We've reached the end of the defs without finding it
             FAIL_ROUTE_AND_EXIT()
         if (strcmp(text, def) == 0)
             break;
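The lookup above is a plain linear scan over a NULL-terminated array of entity names; reaching the NULL sentinel means the name is unknown and the route fails. The same shape in isolation, with a placeholder table standing in for the real entitydefs:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Hypothetical stand-in for the real entitydefs table. */
        const char* entitydefs[] = {"amp", "lt", "gt", "nbsp", NULL};
        const char* text = "nbsp";
        const char* def;
        unsigned i = 0;

        while (1) {
            def = entitydefs[i];
            if (!def) {            /* end of the defs without finding it */
                puts("not found");
                return 1;
            }
            if (strcmp(text, def) == 0)
                break;
            i++;
        }
        printf("found at index %u\n", i);  /* prints 3 */
        return 0;
    }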
@@ -1161,7 +1162,6 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
     }
     if (data == *"|")
         return 0;
-
     if (context & LC_HAS_TEXT) {
         if (context & LC_FAIL_ON_TEXT) {
             if (!Py_UNICODE_ISSPACE(data))
@@ -1182,7 +1182,8 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
         }
     }
     else if (context & LC_FAIL_ON_LBRACE) {
-        if (data == *"{" || (Tokenizer_READ(self, -1) == *"{" && Tokenizer_READ(self, -2) == *"{")) {
+        if (data == *"{" || (Tokenizer_READ(self, -1) == *"{" &&
+                             Tokenizer_READ(self, -2) == *"{")) {
            if (context & LC_TEMPLATE)
                self->topstack->context |= LC_FAIL_ON_EQUALS;
            else
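The rewrapped condition fires when the head sits on a '{' or the two characters just behind it are both '{', i.e. it watches for a template-open sequence. A buffer-based analogue of the lookbehind idea, with a hypothetical read_rel() standing in for Tokenizer_READ():

    #include <stdio.h>

    /* Hypothetical lookbehind helper: read the character `delta`
       positions relative to `head`, or '\0' before the buffer start. */
    static char read_rel(const char* buf, int head, int delta)
    {
        int pos = head + delta;
        return pos < 0 ? '\0' : buf[pos];
    }

    int main(void)
    {
        const char* buf = "{{foo";
        int head = 2;                /* head points at 'f' */
        char data = buf[head];

        if (data == '{' || (read_rel(buf, head, -1) == '{' &&
                            read_rel(buf, head, -2) == '{'))
            puts("possible template opening behind the head");
        return 0;
    }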
@@ -1375,7 +1376,8 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 PyMODINIT_FUNC
 init_tokenizer(void)
 {
-    PyObject *module, *tempmodule, *defmap, *deflist, *globals, *locals, *fromlist, *modname;
+    PyObject *module, *tempmod, *defmap, *deflist, *globals, *locals,
+             *fromlist, *modname;
     unsigned numdefs, i;
     char *name;

@@ -1386,13 +1388,13 @@ init_tokenizer(void)
     Py_INCREF(&TokenizerType);
     PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);

-    tempmodule = PyImport_ImportModule("htmlentitydefs");
-    if (!tempmodule)
+    tempmod = PyImport_ImportModule("htmlentitydefs");
+    if (!tempmod)
         return;
-    defmap = PyObject_GetAttrString(tempmodule, "entitydefs");
+    defmap = PyObject_GetAttrString(tempmod, "entitydefs");
     if (!defmap)
         return;
-    Py_DECREF(tempmodule);
+    Py_DECREF(tempmod);
     deflist = PyDict_Keys(defmap);
     if (!deflist)
         return;
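The renamed tempmod is held only long enough to pull entitydefs out of the htmlentitydefs module, then released; defmap keeps its own reference. A sketch of that borrow-then-drop pattern, assuming a Python 2 build as in the diff:

    #include <Python.h>

    /* Grab an attribute from a module, then drop the module reference
       once the attribute is held: the attribute owns its own refcount. */
    int main(void)
    {
        Py_Initialize();
        PyObject* tempmod = PyImport_ImportModule("htmlentitydefs");
        if (tempmod) {
            PyObject* defmap = PyObject_GetAttrString(tempmod, "entitydefs");
            Py_DECREF(tempmod);  /* defmap remains valid after this */
            if (defmap) {
                printf("%zd entities\n", PyDict_Size(defmap));
                Py_DECREF(defmap);
            }
        }
        Py_Finalize();
        return 0;
    }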
@@ -1416,18 +1418,20 @@ init_tokenizer(void)
     if (!modname)
         return;
     PyList_SET_ITEM(fromlist, 0, modname);
-    tempmodule = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
+    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
     Py_DECREF(fromlist);
-    if (!tempmodule)
+    if (!tempmod)
         return;
-    tokens = PyObject_GetAttrString(tempmodule, "tokens");
-    Py_DECREF(tempmodule);
+    tokens = PyObject_GetAttrString(tempmod, "tokens");
+    Py_DECREF(tempmod);

     Text = PyObject_GetAttrString(tokens, "Text");

     TemplateOpen = PyObject_GetAttrString(tokens, "TemplateOpen");
-    TemplateParamSeparator = PyObject_GetAttrString(tokens, "TemplateParamSeparator");
-    TemplateParamEquals = PyObject_GetAttrString(tokens, "TemplateParamEquals");
+    TemplateParamSeparator = PyObject_GetAttrString(tokens,
+                                                    "TemplateParamSeparator");
+    TemplateParamEquals = PyObject_GetAttrString(tokens,
+                                                 "TemplateParamEquals");
     TemplateClose = PyObject_GetAttrString(tokens, "TemplateClose");

     ArgumentOpen = PyObject_GetAttrString(tokens, "ArgumentOpen");

mwparserfromhell/parser/tokenizer.h (+4, -0)

@@ -36,6 +36,10 @@ SOFTWARE.
 #define malloc PyObject_Malloc
 #define free PyObject_Free

+#define DIGITS "0123456789"
+#define HEXDIGITS "0123456789abcdefABCDEF"
+#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
 static const char* MARKERS[] = {
     "{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
     "!", "\n", ""};

