From 38050f687845741daef97938fb3af39f03e76708 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 15 Oct 2013 19:07:07 -0400
Subject: [PATCH] C code cleanup and speed improvements.

---
 CHANGELOG                           |   1 +
 docs/changelog.rst                  |   1 +
 mwparserfromhell/parser/tokenizer.c | 255 ++++++++++++++++++------------------
 mwparserfromhell/parser/tokenizer.h |   8 +-
 4 files changed, 132 insertions(+), 133 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index a22463a..a00f8f3 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,7 @@ v0.4 (unreleased):
   (taking one argument, a Node, and returning a bool) in addition to a regex.
 - Wikicode.matches() now accepts a tuple of strings/Wikicode objects instead
   of just a single string or Wikicode.
+- C code cleanup and speed improvements.
 
 v0.3.2 (released September 1, 2013):

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 7d9ced7..6708f0f 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -17,6 +17,7 @@ Unreleased
 - :py:meth:`.Wikicode.matches` now accepts a tuple of
   strings/:py:class:`.Wikicode` objects instead of just a single string or
   :py:class:`.Wikicode`.
+- C code cleanup and speed improvements.
 
 v0.3.2
 ------

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 609a595..1823006 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -31,7 +31,7 @@ static int is_marker(Py_UNICODE this)
     int i;
     for (i = 0; i < NUM_MARKERS; i++) {
-        if (*MARKERS[i] == this)
+        if (MARKERS[i] == this)
             return 1;
     }
     return 0;
 }
@@ -642,7 +642,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
     PyObject *tokenlist;
 
     self->head += 2;
-    while (Tokenizer_READ(self, 0) == *"{" && braces < MAX_BRACES) {
+    while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) {
         self->head++;
         braces++;
     }
@@ -674,8 +674,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
     if (BAD_ROUTE) {
         char text[MAX_BRACES + 1];
         RESET_ROUTE();
-        for (i = 0; i < braces; i++) text[i] = *"{";
-        text[braces] = *"";
+        for (i = 0; i < braces; i++) text[i] = '{';
+        text[braces] = '\0';
         if (Tokenizer_emit_text_then_stack(self, text)) {
             Py_XDECREF(text);
             return -1;
@@ -872,7 +872,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
 
     if (Tokenizer_push(self, LC_EXT_LINK_URI))
         return -1;
-    if (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/") {
+    if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') {
         if (Tokenizer_emit_text(self, "//"))
             return -1;
         self->head += 2;
@@ -881,7 +881,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
     buffer = Textbuffer_new();
     if (!buffer)
         return -1;
-    while ((this = Tokenizer_READ(self, 0)) != *"") {
+    while ((this = Tokenizer_READ(self, 0))) {
         i = 0;
         while (1) {
             if (!valid[i])
@@ -898,18 +898,18 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
         self->head++;
     }
     end_of_loop:
-    if (this != *":") {
+    if (this != ':') {
         Textbuffer_dealloc(buffer);
         Tokenizer_fail_route(self);
         return 0;
     }
-    if (Tokenizer_emit_char(self, *":")) {
+    if (Tokenizer_emit_char(self, ':')) {
         Textbuffer_dealloc(buffer);
         return -1;
     }
     self->head++;
-    slashes = (Tokenizer_READ(self, 0) == *"/" &&
-               Tokenizer_READ(self, 1) == *"/");
+    slashes = (Tokenizer_READ(self, 0) == '/' &&
+               Tokenizer_READ(self, 1) == '/');
     if (slashes) {
         if (Tokenizer_emit_text(self, "//")) {
             Textbuffer_dealloc(buffer);
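The change running through these hunks is mechanical: dereferenced one-character string literals (*"{", *":", *"/") become plain character literals ('{', ':', '/'). Both spellings yield the same value when compared against a code point, but the character literal is an integer constant expression rather than a run-time load through a pointer to static data. A minimal standalone sketch of the equivalence — the variable name is illustrative, not from the patch:

    #include <stdio.h>

    int main(void)
    {
        int this = '{';  /* stand-in for a Py_UNICODE code point */

        /* Both tests print 1: *"{" reads the first byte of a string
           literal through a pointer at run time, while '{' is a constant
           the compiler folds directly into the comparison. */
        printf("%d %d\n", *"{" == this, '{' == this);
        return 0;
    }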
@@ -973,8 +973,8 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
         Textbuffer_dealloc(scheme_buffer);
         return -1;
     }
-    slashes = (Tokenizer_READ(self, 0) == *"/" &&
-               Tokenizer_READ(self, 1) == *"/");
+    slashes = (Tokenizer_READ(self, 0) == '/' &&
+               Tokenizer_READ(self, 1) == '/');
     if (!IS_SCHEME(scheme, slashes, 1)) {
         Py_DECREF(scheme);
         Textbuffer_dealloc(scheme_buffer);
@@ -988,7 +988,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     }
     if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1))
         return -1;
-    if (Tokenizer_emit_char(self, *":"))
+    if (Tokenizer_emit_char(self, ':'))
         return -1;
     if (slashes) {
         if (Tokenizer_emit_text(self, "//"))
             return -1;
@@ -1014,13 +1014,13 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
             return error; \
     }
 
-    if (this == *"(" && !(*parens)) {
+    if (this == '(' && !(*parens)) {
         *parens = 1;
         PUSH_TAIL_BUFFER(*tail, -1)
     }
-    else if (this == *"," || this == *";" || this == *"\\" || this == *"." ||
-             this == *":" || this == *"!" || this == *"?" ||
-             (!(*parens) && this == *")"))
+    else if (this == ',' || this == ';' || this == '\\' || this == '.' ||
+             this == ':' || this == '!' || this == '?' ||
+             (!(*parens) && this == ')'))
         return Textbuffer_write(tail, this);
     else
         PUSH_TAIL_BUFFER(*tail, -1)
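One case above needs an escape in character-literal form: *"\\", a string holding a single backslash, becomes '\\'. The single-quote comparison rewritten in the next hunk likewise becomes '\''. A tiny sketch, independent of the patch, confirming both equivalences:

    #include <stdio.h>

    int main(void)
    {
        /* Backslash and single quote are the two punctuation characters
           that must be escaped as character literals; both tests print 1. */
        printf("%d %d\n", *"\\" == '\\', *"'" == '\'');
        return 0;
    }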
@@ -1037,12 +1037,12 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
     Py_UNICODE after = Tokenizer_READ(self, 2);
     int ctx = self->topstack->context;
 
-    return (this == *"" || this == *"\n" || this == *"[" || this == *"]" ||
-            this == *"<" || this == *">" || (this == *"'" && next == *"'") ||
-            (this == *"|" && ctx & LC_TEMPLATE) ||
-            (this == *"=" && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
-            (this == *"}" && next == *"}" &&
-             (ctx & LC_TEMPLATE || (after == *"}" && ctx & LC_ARGUMENT))));
+    return (!this || this == '\n' || this == '[' || this == ']' ||
+            this == '<' || this == '>' || (this == '\'' && next == '\'') ||
+            (this == '|' && ctx & LC_TEMPLATE) ||
+            (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
+            (this == '}' && next == '}' &&
+             (ctx & LC_TEMPLATE || (after == '}' && ctx & LC_ARGUMENT))));
 }
 
 /*
@@ -1061,21 +1061,21 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
     if (BAD_ROUTE)
         return NULL;
     this = Tokenizer_READ(self, 0);
-    if (this == *"" || this == *"\n" || this == *" " || this == *"]")
+    if (!this || this == '\n' || this == ' ' || this == ']')
         return Tokenizer_fail_route(self);
-    if (!brackets && this == *"[")
+    if (!brackets && this == '[')
         return Tokenizer_fail_route(self);
     while (1) {
         this = Tokenizer_READ(self, 0);
         next = Tokenizer_READ(self, 1);
-        if (this == *"&") {
+        if (this == '&') {
             PUSH_TAIL_BUFFER(*extra, NULL)
             if (Tokenizer_parse_entity(self))
                 return NULL;
         }
-        else if (this == *"<" && next == *"!"
-                 && Tokenizer_READ(self, 2) == *"-"
-                 && Tokenizer_READ(self, 3) == *"-") {
+        else if (this == '<' && next == '!'
+                 && Tokenizer_READ(self, 2) == '-'
+                 && Tokenizer_READ(self, 3) == '-') {
             PUSH_TAIL_BUFFER(*extra, NULL)
             if (Tokenizer_parse_comment(self))
                 return NULL;
@@ -1084,16 +1084,16 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
             self->head--;
             return Tokenizer_pop(self);
         }
-        else if (this == *"" || this == *"\n")
+        else if (!this || this == '\n')
             return Tokenizer_fail_route(self);
-        else if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
+        else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
             PUSH_TAIL_BUFFER(*extra, NULL)
             if (Tokenizer_parse_template_or_argument(self))
                 return NULL;
         }
-        else if (this == *"]")
+        else if (this == ']')
             return Tokenizer_pop(self);
-        else if (this == *" ") {
+        else if (this == ' ') {
             if (brackets) {
                 if (Tokenizer_emit(self, ExternalLinkSeparator))
                     return NULL;
@@ -1102,7 +1102,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
                 self->head++;
                 return Tokenizer_parse(self, 0, 0);
             }
-            if (Textbuffer_write(extra, *" "))
+            if (Textbuffer_write(extra, ' '))
                 return NULL;
             return Tokenizer_pop(self);
         }
@@ -1232,7 +1232,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
 
     self->global |= GL_HEADING;
     self->head += 1;
-    while (Tokenizer_READ(self, 0) == *"=") {
+    while (Tokenizer_READ(self, 0) == '=') {
         best++;
         self->head++;
     }
@@ -1242,7 +1242,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
         RESET_ROUTE();
         self->head = reset + best - 1;
         for (i = 0; i < best; i++) {
-            if (Tokenizer_emit_char(self, *"="))
+            if (Tokenizer_emit_char(self, '='))
                 return -1;
         }
         self->global ^= GL_HEADING;
@@ -1271,7 +1271,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
     if (heading->level < best) {
         diff = best - heading->level;
         for (i = 0; i < diff; i++) {
-            if (Tokenizer_emit_char(self, *"=")) {
+            if (Tokenizer_emit_char(self, '=')) {
                 Py_DECREF(heading->title);
                 free(heading);
                 return -1;
@@ -1303,7 +1303,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
 
     self->head += 1;
     best = 1;
-    while (Tokenizer_READ(self, 0) == *"=") {
+    while (Tokenizer_READ(self, 0) == '=') {
         best++;
         self->head++;
     }
@@ -1316,7 +1316,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
         if (level < best) {
             diff = best - level;
             for (i = 0; i < diff; i++) {
-                if (Tokenizer_emit_char(self, *"="))
+                if (Tokenizer_emit_char(self, '='))
                     return NULL;
             }
         }
@@ -1324,7 +1324,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
     }
     else {
         for (i = 0; i < best; i++) {
-            if (Tokenizer_emit_char(self, *"=")) {
+            if (Tokenizer_emit_char(self, '=')) {
                 Py_DECREF(after->title);
                 free(after);
                 return NULL;
@@ -1372,21 +1372,21 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
         return -1;
     self->head++;
     this = Tokenizer_READ(self, 0);
-    if (this == *"") {
+    if (!this) {
         Tokenizer_fail_route(self);
         return 0;
     }
-    if (this == *"#") {
+    if (this == '#') {
         numeric = 1;
         if (Tokenizer_emit(self, HTMLEntityNumeric))
             return -1;
         self->head++;
         this = Tokenizer_READ(self, 0);
-        if (this == *"") {
+        if (!this) {
             Tokenizer_fail_route(self);
             return 0;
         }
-        if (this == *"x" || this == *"X") {
+        if (this == 'x' || this == 'X') {
             hexadecimal = 1;
             kwargs = PyDict_New();
             if (!kwargs)
@@ -1416,22 +1416,20 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
     zeroes = 0;
     while (1) {
         this = Tokenizer_READ(self, 0);
-        if (this == *";") {
+        if (this == ';') {
             if (i == 0)
                 FAIL_ROUTE_AND_EXIT()
             break;
         }
-        if (i == 0 && this == *"0") {
+        if (i == 0 && this == '0') {
             zeroes++;
             self->head++;
             continue;
         }
         if (i >= MAX_ENTITY_SIZE)
             FAIL_ROUTE_AND_EXIT()
-        for (j = 0; j < NUM_MARKERS; j++) {
-            if (this == *MARKERS[j])
-                FAIL_ROUTE_AND_EXIT()
-        }
+        if (is_marker(this))
+            FAIL_ROUTE_AND_EXIT()
         j = 0;
         while (1) {
             if (!valid[j])
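The entity hunk above also folds an inline scan over MARKERS into a call to the is_marker() helper touched at the top of the patch. Together with that first hunk's switch from *MARKERS[i] to MARKERS[i], this implies MARKERS is now a table of characters rather than of one-character strings (the tokenizer.h side of the change is not shown here). A self-contained sketch of the resulting shape, using an abbreviated stand-in table rather than the real one:

    #include <stdio.h>

    /* Abbreviated stand-in; the real MARKERS table lives in tokenizer.h. */
    static const char MARKERS[] = {'{', '}', '[', ']', '<', '>', '|', '='};
    #define NUM_MARKERS ((int) (sizeof MARKERS / sizeof MARKERS[0]))

    static int is_marker(int this)
    {
        int i;

        for (i = 0; i < NUM_MARKERS; i++) {
            if (MARKERS[i] == this)
                return 1;
        }
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", is_marker('{'), is_marker('a'));  /* prints: 1 0 */
        return 0;
    }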
@@ -1508,7 +1506,7 @@ static int Tokenizer_parse_entity(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        if (Tokenizer_emit_char(self, *"&"))
+        if (Tokenizer_emit_char(self, '&'))
             return -1;
         return 0;
     }
@@ -1537,14 +1535,14 @@ static int Tokenizer_parse_comment(Tokenizer* self)
         return -1;
     while (1) {
         this = Tokenizer_READ(self, 0);
-        if (this == *"") {
+        if (!this) {
             comment = Tokenizer_pop(self);
             Py_XDECREF(comment);
             self->head = reset;
             return Tokenizer_emit_text(self, "
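The final visible hunk repeats the other recurring rewrite: this == *"", which compares against the NUL terminator of an empty string literal, becomes the plain truth test !this. Both detect code point 0, the value the tokenizer's read macro yields past the end of input. A sketch with an invented helper name, not patch code:

    #include <stdio.h>

    /* this == *"" and !this both test for the code point 0 that marks
       end of input. */
    static int at_end_of_input(int this)
    {
        return !this;
    }

    int main(void)
    {
        printf("%d %d\n", at_end_of_input(0), at_end_of_input('a'));  /* prints: 1 0 */
        return 0;
    }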