Browse Source

Merge branch 'feature/win_builds' into develop

tags/v0.4.1
Ben Kurtovic 8 years ago
parent
commit
d6d3b45cd8
29 changed files with 2129 additions and 1473 deletions
  1. +8
    -0
      CHANGELOG
  2. +64
    -0
      appveyor.yml
  3. +13
    -2
      docs/changelog.rst
  4. +0
    -2
      mwparserfromhell/compat.py
  5. +1
    -3
      mwparserfromhell/definitions.py
  6. +125
    -0
      mwparserfromhell/parser/ctokenizer/common.h
  7. +105
    -0
      mwparserfromhell/parser/ctokenizer/contexts.h
  8. +78
    -0
      mwparserfromhell/parser/ctokenizer/tag_data.c
  9. +43
    -0
      mwparserfromhell/parser/ctokenizer/tag_data.h
  10. +232
    -0
      mwparserfromhell/parser/ctokenizer/textbuffer.c
  11. +36
    -0
      mwparserfromhell/parser/ctokenizer/textbuffer.h
  12. +182
    -874
      mwparserfromhell/parser/ctokenizer/tok_parse.c
  13. +35
    -0
      mwparserfromhell/parser/ctokenizer/tok_parse.h
  14. +345
    -0
      mwparserfromhell/parser/ctokenizer/tok_support.c
  15. +62
    -0
      mwparserfromhell/parser/ctokenizer/tok_support.h
  16. +310
    -0
      mwparserfromhell/parser/ctokenizer/tokenizer.c
  17. +111
    -0
      mwparserfromhell/parser/ctokenizer/tokenizer.h
  18. +111
    -0
      mwparserfromhell/parser/ctokenizer/tokens.c
  19. +69
    -0
      mwparserfromhell/parser/ctokenizer/tokens.h
  20. +0
    -367
      mwparserfromhell/parser/tokenizer.h
  21. +42
    -58
      mwparserfromhell/smart_list.py
  22. +3
    -0
      scripts/README
  23. +11
    -11
      scripts/release.sh
  24. +0
    -58
      scripts/win_build.py
  25. +43
    -0
      scripts/win_wrapper.cmd
  26. +38
    -59
      setup.py
  27. +41
    -26
      tests/_test_tokenizer.py
  28. +20
    -12
      tests/test_smart_list.py
  29. +1
    -1
      tests/tokenizer/text.mwtest

+ 8
- 0
CHANGELOG View File

@@ -8,9 +8,17 @@ v0.4.1 (unreleased):
includes when denoting tags, but not comments.
- Fixed the behavior of preserve_spacing in Template.add() and keep_field in
Template.remove() on parameters with hidden keys.
- Removed _ListProxy.detach(). SmartLists now use weak references and their
children are garbage-collected properly.
- Fixed parser bugs involving:
- templates with completely blank names;
- templates with newlines and comments.
- Heavy refactoring and fixes to the C tokenizer, including:
- corrected a design flaw in text handling, allowing for substantial speed
improvements when parsing long strings of plain text;
- implemented new Python 3.3 PEP 393 Unicode APIs.
- Fixed various bugs in SmartList, including one that was causing memory issues
on 64-bit builds of Python 2 on Windows.
- Fixed some bugs in the release scripts.

v0.4 (released May 23, 2015):


+ 64
- 0
appveyor.yml View File

@@ -0,0 +1,64 @@
# This config file is used by appveyor.com to build Windows release binaries

version: 0.4.1.dev0-b{build}

branches:
only:
- master

skip_tags: true

environment:
global:
# See: http://stackoverflow.com/a/13751649/163740
WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd"
PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe"
SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension"
PYPI_USERNAME: "earwigbot"
PYPI_PASSWORD:
secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+

matrix:
- PYTHON: "C:\\Python27"
PYTHON_VERSION: "2.7"
PYTHON_ARCH: "32"

- PYTHON: "C:\\Python27-x64"
PYTHON_VERSION: "2.7"
PYTHON_ARCH: "64"

- PYTHON: "C:\\Python33"
PYTHON_VERSION: "3.3"
PYTHON_ARCH: "32"

- PYTHON: "C:\\Python33-x64"
PYTHON_VERSION: "3.3"
PYTHON_ARCH: "64"

- PYTHON: "C:\\Python34"
PYTHON_VERSION: "3.4"
PYTHON_ARCH: "32"

- PYTHON: "C:\\Python34-x64"
PYTHON_VERSION: "3.4"
PYTHON_ARCH: "64"

install:
- "%PIP% install wheel twine"

build_script:
- "%SETUPPY% build"

test_script:
- "%SETUPPY% -q test"

after_test:
- "%SETUPPY% bdist_wheel"

on_success:
- "twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%"

artifacts:
- path: dist\*

deploy: off

+ 13
- 2
docs/changelog.rst View File

@@ -13,13 +13,24 @@ Unreleased
- Added support for Python 3.5.
- ``<`` and ``>`` are now disallowed in wikilink titles and template names.
This includes when denoting tags, but not comments.
- Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and
*keep_field* in :func:`~.Template.remove` on parameters with hidden keys.
- Fixed the behavior of *preserve_spacing* in :meth:`.Template.add` and
*keep_field* in :meth:`.Template.remove` on parameters with hidden keys.
- Removed :meth:`._ListProxy.detach`. :class:`.SmartList`\ s now use weak
references and their children are garbage-collected properly.
- Fixed parser bugs involving:

- templates with completely blank names;
- templates with newlines and comments.

- Heavy refactoring and fixes to the C tokenizer, including:

- corrected a design flaw in text handling, allowing for substantial speed
improvements when parsing long strings of plain text;
- implemented new Python 3.3
`PEP 393 <https://www.python.org/dev/peps/pep-0393/>`_ Unicode APIs.

- Fixed various bugs in :class:`.SmartList`, including one that was causing
memory issues on 64-bit builds of Python 2 on Windows.
- Fixed some bugs in the release scripts.

v0.4


+ 0
- 2
mwparserfromhell/compat.py View File

@@ -18,14 +18,12 @@ if py3k:
bytes = bytes
str = str
range = range
maxsize = sys.maxsize
import html.entities as htmlentities

else:
bytes = str
str = unicode
range = xrange
maxsize = sys.maxint
import htmlentitydefs as htmlentities

del sys

+ 1
- 3
mwparserfromhell/definitions.py View File

@@ -81,10 +81,8 @@ def is_single_only(tag):
"""Return whether or not the given *tag* must exist without a close tag."""
return tag.lower() in SINGLE_ONLY

def is_scheme(scheme, slashes=True, reverse=False):
def is_scheme(scheme, slashes=True):
"""Return whether *scheme* is valid for external links."""
if reverse: # Convenience for C
scheme = scheme[::-1]
scheme = scheme.lower()
if slashes:
return scheme in URI_SCHEMES


+ 125
- 0
mwparserfromhell/parser/ctokenizer/common.h View File

@@ -0,0 +1,125 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html
#endif

#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>

/* Compatibility macros */

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
#endif

#define malloc PyObject_Malloc // XXX: yuck
#define realloc PyObject_Realloc
#define free PyObject_Free

/* Unicode support macros */

#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3
#define PEP_393
#endif

#ifdef PEP_393
#define Unicode Py_UCS4
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
#else
#define Unicode Py_UNICODE
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromUnicode(&(chr), 1)
#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
#endif

/* Error handling macros */

#define BAD_ROUTE self->route_state
#define BAD_ROUTE_CONTEXT self->route_context
#define FAIL_ROUTE(context) { \
self->route_state = 1; \
self->route_context = context; \
}
#define RESET_ROUTE() self->route_state = 0

/* Shared globals */

extern char** entitydefs;

extern PyObject* NOARGS;
extern PyObject* definitions;

/* Structs */

typedef struct {
Py_ssize_t capacity;
Py_ssize_t length;
#ifdef PEP_393
PyObject* object;
int kind;
void* data;
#else
Py_UNICODE* data;
#endif
} Textbuffer;

struct Stack {
PyObject* stack;
uint64_t context;
Textbuffer* textbuffer;
struct Stack* next;
};
typedef struct Stack Stack;

typedef struct {
PyObject* object; /* base PyUnicodeObject object */
Py_ssize_t length; /* length of object, in code points */
#ifdef PEP_393
int kind; /* object's kind value */
void* data; /* object's raw unicode buffer */
#else
Py_UNICODE* buf; /* object's internal buffer */
#endif
} TokenizerInput;

typedef struct {
PyObject_HEAD
TokenizerInput text; /* text to tokenize */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int cycles; /* total number of stack recursions */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
} Tokenizer;

+ 105
- 0
mwparserfromhell/parser/ctokenizer/contexts.h View File

@@ -0,0 +1,105 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

/* Local contexts */

#define LC_TEMPLATE 0x0000000000000007
#define LC_TEMPLATE_NAME 0x0000000000000001
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004

#define LC_ARGUMENT 0x0000000000000018
#define LC_ARGUMENT_NAME 0x0000000000000008
#define LC_ARGUMENT_DEFAULT 0x0000000000000010

#define LC_WIKILINK 0x0000000000000060
#define LC_WIKILINK_TITLE 0x0000000000000020
#define LC_WIKILINK_TEXT 0x0000000000000040

#define LC_EXT_LINK 0x0000000000000180
#define LC_EXT_LINK_URI 0x0000000000000080
#define LC_EXT_LINK_TITLE 0x0000000000000100

#define LC_HEADING 0x0000000000007E00
#define LC_HEADING_LEVEL_1 0x0000000000000200
#define LC_HEADING_LEVEL_2 0x0000000000000400
#define LC_HEADING_LEVEL_3 0x0000000000000800
#define LC_HEADING_LEVEL_4 0x0000000000001000
#define LC_HEADING_LEVEL_5 0x0000000000002000
#define LC_HEADING_LEVEL_6 0x0000000000004000

#define LC_TAG 0x0000000000078000
#define LC_TAG_OPEN 0x0000000000008000
#define LC_TAG_ATTR 0x0000000000010000
#define LC_TAG_BODY 0x0000000000020000
#define LC_TAG_CLOSE 0x0000000000040000

#define LC_STYLE 0x0000000000780000
#define LC_STYLE_ITALICS 0x0000000000080000
#define LC_STYLE_BOLD 0x0000000000100000
#define LC_STYLE_PASS_AGAIN 0x0000000000200000
#define LC_STYLE_SECOND_PASS 0x0000000000400000

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000007F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000
#define LC_HAS_TEMPLATE 0x0000000040000000

#define LC_TABLE 0x0000001F80000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
#define LC_TABLE_OPEN 0x0000000080000000
#define LC_TABLE_CELL_OPEN 0x0000000100000000
#define LC_TABLE_CELL_STYLE 0x0000000200000000
#define LC_TABLE_ROW_OPEN 0x0000000400000000
#define LC_TABLE_TD_LINE 0x0000000800000000
#define LC_TABLE_TH_LINE 0x0000001000000000

/* Global contexts */

#define GL_HEADING 0x1

/* Aggregate contexts */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

/* Tag contexts */

#define TAG_NAME 0x01
#define TAG_ATTR_READY 0x02
#define TAG_ATTR_NAME 0x04
#define TAG_ATTR_VALUE 0x08
#define TAG_QUOTED 0x10
#define TAG_NOTE_SPACE 0x20
#define TAG_NOTE_EQUALS 0x40
#define TAG_NOTE_QUOTE 0x80

+ 78
- 0
mwparserfromhell/parser/ctokenizer/tag_data.c View File

@@ -0,0 +1,78 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tag_data.h"
#include "contexts.h"

/*
    Initialize a new TagData object.

    Allocates the struct and its three padding textbuffers. Returns NULL
    with a Python exception set on failure; any partially-constructed
    state is released before returning.
*/
TagData* TagData_new(TokenizerInput* text)
{
#define ALLOC_BUFFER(name) \
    name = Textbuffer_new(text); \
    if (!name) { \
        TagData_dealloc(self); \
        return NULL; \
    }

    TagData *self = malloc(sizeof(TagData));
    if (!self) {
        PyErr_NoMemory();
        return NULL;
    }
    self->context = TAG_NAME;
    /* NULL the buffer pointers before any ALLOC_BUFFER: malloc() does not
       zero memory, and TagData_dealloc() inspects all three pointers, so a
       failure in the first allocation would otherwise make dealloc read
       (and free) uninitialized garbage. */
    self->pad_first = NULL;
    self->pad_before_eq = NULL;
    self->pad_after_eq = NULL;
    ALLOC_BUFFER(self->pad_first)
    ALLOC_BUFFER(self->pad_before_eq)
    ALLOC_BUFFER(self->pad_after_eq)
    self->quoter = 0;
    self->reset = 0;
    return self;

#undef ALLOC_BUFFER
}

/*
    Deallocate the given TagData object.

    Safe on partially-constructed objects as long as unallocated buffer
    pointers are NULL; frees each padding buffer then the struct itself.
*/
void TagData_dealloc(TagData* self)
{
    if (self->pad_first)
        Textbuffer_dealloc(self->pad_first);
    if (self->pad_before_eq)
        Textbuffer_dealloc(self->pad_before_eq);
    if (self->pad_after_eq)
        Textbuffer_dealloc(self->pad_after_eq);
    free(self);
}

/*
    Clear the internal padding buffers of the given TagData object.

    Returns 0 on success, -1 if any buffer fails to reset.
*/
int TagData_reset_buffers(TagData* self)
{
    if (Textbuffer_reset(self->pad_first))
        return -1;
    if (Textbuffer_reset(self->pad_before_eq))
        return -1;
    if (Textbuffer_reset(self->pad_after_eq))
        return -1;
    return 0;
}

+ 43
- 0
mwparserfromhell/parser/ctokenizer/tag_data.h View File

@@ -0,0 +1,43 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"
#include "textbuffer.h"

/* Structs */

typedef struct {
uint64_t context;
Textbuffer* pad_first;
Textbuffer* pad_before_eq;
Textbuffer* pad_after_eq;
Unicode quoter;
Py_ssize_t reset;
} TagData;

/* Functions */

TagData* TagData_new(TokenizerInput*);
void TagData_dealloc(TagData*);
int TagData_reset_buffers(TagData*);

+ 232
- 0
mwparserfromhell/parser/ctokenizer/textbuffer.c View File

@@ -0,0 +1,232 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "textbuffer.h"

#define INITIAL_CAPACITY 32
#define RESIZE_FACTOR 2
#define CONCAT_EXTRA 32

/*
    Internal allocation function for textbuffers.

    Creates the initial backing store (INITIAL_CAPACITY slots, length 0)
    for *self*. On the PEP 393 path, 'maxchar' caps the widest code point
    the buffer must be able to hold; otherwise it is unused. Returns 0 on
    success and -1 on allocation failure.
*/
static int internal_alloc(Textbuffer* self, Unicode maxchar)
{
    self->capacity = INITIAL_CAPACITY;
    self->length = 0;

#ifdef PEP_393
    self->object = PyUnicode_New(self->capacity, maxchar);
    if (!self->object)
        return -1;
    self->kind = PyUnicode_KIND(self->object);
    self->data = PyUnicode_DATA(self->object);
#else
    (void) maxchar;  // Unused on the narrow/legacy-unicode path
    self->data = malloc(sizeof(Unicode) * self->capacity);
    if (!self->data)
        return -1;
#endif

    return 0;
}

/*
    Internal deallocation function for textbuffers.

    Releases only the backing store (unicode object or raw buffer), not
    the Textbuffer struct itself.
*/
static void internal_dealloc(Textbuffer* self)
{
#ifdef PEP_393
    Py_DECREF(self->object);
#else
    free(self->data);
#endif
}

/*
    Internal resize function.

    Grow (or shrink) the backing store to hold 'new_cap' code points,
    preserving the current contents. Returns 0 on success and -1 on
    failure (with a Python exception set), in which case the buffer is
    left unchanged and still valid.
*/
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
{
#ifdef PEP_393
    PyObject *newobj;
    void *newdata;

    newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
    if (!newobj)
        return -1;
    newdata = PyUnicode_DATA(newobj);
    memcpy(newdata, self->data, self->length * self->kind);
    Py_DECREF(self->object);
    self->object = newobj;
    self->data = newdata;
#else
    /* Assign through a temporary: writing realloc's result straight back
       into self->data would leak the original buffer if realloc fails. */
    Unicode* newbuf = realloc(self->data, sizeof(Unicode) * new_cap);
    if (!newbuf) {
        PyErr_NoMemory();  // match the error-setting behavior of the PEP 393 path
        return -1;
    }
    self->data = newbuf;
#endif

    self->capacity = new_cap;
    return 0;
}

/*
    Create a new textbuffer object.

    'text' supplies the tokenizer input; on the PEP 393 path its maximum
    code point is used to size the buffer's character width so later writes
    never need to widen. Returns NULL with a MemoryError set on failure.
*/
Textbuffer* Textbuffer_new(TokenizerInput* text)
{
    Textbuffer* self = malloc(sizeof(Textbuffer));
    Unicode maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
#endif

    if (!self)
        goto fail_nomem;
    if (internal_alloc(self, maxchar) < 0)
        goto fail_dealloc;
    return self;

    fail_dealloc:
    free(self);
    fail_nomem:
    PyErr_NoMemory();
    return NULL;
}

/*
    Deallocate the given textbuffer: its backing store, then the struct.
*/
void Textbuffer_dealloc(Textbuffer* self)
{
    internal_dealloc(self);
    free(self);
}

/*
    Reset a textbuffer to its initial, empty state.

    The old backing store is released and a fresh one is allocated,
    keeping the same maximum code point on the PEP 393 path. Returns 0 on
    success and -1 on allocation failure (after which the buffer's data
    pointer is no longer valid).
*/
int Textbuffer_reset(Textbuffer* self)
{
    Unicode maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);
#endif

    internal_dealloc(self);
    if (internal_alloc(self, maxchar))
        return -1;
    return 0;
}

/*
    Write a Unicode codepoint to the given textbuffer.

    Grows the buffer geometrically (by RESIZE_FACTOR) when full, giving
    amortized O(1) appends. Returns 0 on success, -1 on resize failure.
*/
int Textbuffer_write(Textbuffer* self, Unicode code)
{
    if (self->length >= self->capacity) {
        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
            return -1;
    }

#ifdef PEP_393
    PyUnicode_WRITE(self->kind, self->data, self->length++, code);
#else
    self->data[self->length++] = code;
#endif

    return 0;
}

/*
    Read a Unicode codepoint from the given index of the given textbuffer.

    This function does not check for bounds; callers must ensure
    0 <= index < self->length.
*/
Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
{
#ifdef PEP_393
    return PyUnicode_READ(self->kind, self->data, index);
#else
    return self->data[index];
#endif
}

/*
    Return the contents of the textbuffer as a Python Unicode object.

    Returns a new reference, or NULL with an exception set on failure.
*/
PyObject* Textbuffer_render(Textbuffer* self)
{
#ifdef PEP_393
    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
#else
    return PyUnicode_FromUnicode(self->data, self->length);
#endif
}

/*
    Concatenate the 'other' textbuffer onto the end of the given textbuffer.

    Resizes with CONCAT_EXTRA slack to amortize repeated concatenations.
    On the PEP 393 path both buffers are assumed to share the same storage
    kind (all buffers are created from the same input's max code point).
    Returns 0 on success, -1 on resize failure.
*/
int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
{
    Py_ssize_t newlen = self->length + other->length;

    if (newlen > self->capacity) {
        if (internal_resize(self, newlen + CONCAT_EXTRA) < 0)
            return -1;
    }

#ifdef PEP_393
    assert(self->kind == other->kind);
    // Offset is in bytes: kind is the per-character width in bytes.
    memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
           other->length * other->kind);
#else
    memcpy(self->data + self->length, other->data,
           other->length * sizeof(Unicode));
#endif

    self->length = newlen;
    return 0;
}

/*
    Reverse the contents of the given textbuffer, in place.
*/
void Textbuffer_reverse(Textbuffer* self)
{
    Py_ssize_t front = 0, back = self->length - 1;
    Unicode swap;

    // Walk inward from both ends, swapping pairs until the pointers meet.
    while (front < back) {
#ifdef PEP_393
        swap = PyUnicode_READ(self->kind, self->data, front);
        PyUnicode_WRITE(self->kind, self->data, front,
                        PyUnicode_READ(self->kind, self->data, back));
        PyUnicode_WRITE(self->kind, self->data, back, swap);
#else
        swap = self->data[front];
        self->data[front] = self->data[back];
        self->data[back] = swap;
#endif
        front++;
        back--;
    }
}

+ 36
- 0
mwparserfromhell/parser/ctokenizer/textbuffer.h View File

@@ -0,0 +1,36 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

/* Functions */

Textbuffer* Textbuffer_new(TokenizerInput*);
void Textbuffer_dealloc(Textbuffer*);
int Textbuffer_reset(Textbuffer*);
int Textbuffer_write(Textbuffer*, Unicode);
Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
PyObject* Textbuffer_render(Textbuffer*);
int Textbuffer_concat(Textbuffer*, Textbuffer*);
void Textbuffer_reverse(Textbuffer*);

mwparserfromhell/parser/ctokenizer/tok_parse.c
File diff suppressed because it is too large
View File


+ 35
- 0
mwparserfromhell/parser/ctokenizer/tok_parse.h View File

@@ -0,0 +1,35 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

static const char MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '!', '\n', '\0'};

#define NUM_MARKERS 19

/* Functions */

PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);

+ 345
- 0
mwparserfromhell/parser/ctokenizer/tok_support.c View File

@@ -0,0 +1,345 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tok_support.h"
#include "textbuffer.h"
#include "tokens.h"

/*
    Add a new token stack, context, and textbuffer to the list.

    Pushes a fresh Stack frame on top of self->topstack and bumps the
    depth/cycles counters. Returns 0 on success, -1 with a Python
    exception set on failure (no partial frame is left behind).
*/
int Tokenizer_push(Tokenizer* self, uint64_t context)
{
    Stack* top = malloc(sizeof(Stack));

    if (!top) {
        PyErr_NoMemory();
        return -1;
    }
    top->stack = PyList_New(0);
    if (!top->stack) {
        // Don't leave a frame with a NULL token list; later appends would crash.
        free(top);
        return -1;
    }
    top->context = context;
    top->textbuffer = Textbuffer_new(&self->text);
    if (!top->textbuffer) {
        Py_DECREF(top->stack);
        free(top);
        return -1;
    }
    top->next = self->topstack;
    self->topstack = top;
    self->depth++;
    self->cycles++;
    return 0;
}

/*
    Push the textbuffer onto the stack as a Text node and clear it.

    No-op when the buffer is empty. Returns 0 on success, -1 with a
    Python exception set on failure.
*/
int Tokenizer_push_textbuffer(Tokenizer* self)
{
    PyObject *text, *kwargs, *token;
    Textbuffer* buffer = self->topstack->textbuffer;

    if (buffer->length == 0)
        return 0;
    text = Textbuffer_render(buffer);
    if (!text)
        return -1;
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(text);
        return -1;
    }
    PyDict_SetItemString(kwargs, "text", text);
    Py_DECREF(text);
    token = PyObject_Call(Text, NOARGS, kwargs);
    Py_DECREF(kwargs);
    if (!token)
        return -1;
    if (PyList_Append(self->topstack->stack, token)) {
        Py_DECREF(token);
        return -1;
    }
    Py_DECREF(token);
    if (Textbuffer_reset(buffer))
        return -1;
    return 0;
}

/*
    Pop and deallocate the top token stack/context/textbuffer.

    Unlinks the top Stack frame, releases its resources, and decrements
    the recursion depth counter. Assumes topstack is non-NULL.
*/
void Tokenizer_delete_top_of_stack(Tokenizer* self)
{
    Stack* top = self->topstack;

    Py_DECREF(top->stack);
    Textbuffer_dealloc(top->textbuffer);
    self->topstack = top->next;
    free(top);
    self->depth--;
}

/*
    Pop the current stack/context/textbuffer, returning the stack.

    Pending text is flushed into the stack first. Returns a new reference
    to the token list, or NULL with an exception set on failure.
*/
PyObject* Tokenizer_pop(Tokenizer* self)
{
    PyObject* stack;

    if (Tokenizer_push_textbuffer(self))
        return NULL;
    stack = self->topstack->stack;
    Py_INCREF(stack);
    Tokenizer_delete_top_of_stack(self);
    return stack;
}

/*
    Pop the current stack/context/textbuffer, returning the stack. We will
    also replace the underlying stack's context with the current stack's.

    Returns a new reference, or NULL with an exception set on failure.
*/
PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
{
    PyObject* stack;
    uint64_t context;

    if (Tokenizer_push_textbuffer(self))
        return NULL;
    stack = self->topstack->stack;
    Py_INCREF(stack);
    context = self->topstack->context;
    Tokenizer_delete_top_of_stack(self);
    self->topstack->context = context;
    return stack;
}

/*
    Fail the current tokenization route. Discards the current
    stack/context/textbuffer and sets the BAD_ROUTE flag.

    Always returns NULL so callers can write
    'return Tokenizer_fail_route(self);'.
*/
void* Tokenizer_fail_route(Tokenizer* self)
{
    uint64_t context = self->topstack->context;
    PyObject* stack = Tokenizer_pop(self);

    Py_XDECREF(stack);  // pop may have failed and returned NULL
    FAIL_ROUTE(context);
    return NULL;
}

/*
    Write a token to the current token stack.

    'token' is a token class; it is instantiated with no arguments. If
    'first' is nonzero the instance is prepended to the stack instead of
    appended. Returns 0 on success, -1 with an exception set on failure.
*/
int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
{
    PyObject* instance;

    if (Tokenizer_push_textbuffer(self))
        return -1;
    instance = PyObject_CallObject(token, NULL);
    if (!instance)
        return -1;
    if (first ? PyList_Insert(self->topstack->stack, 0, instance) :
                PyList_Append(self->topstack->stack, instance)) {
        Py_DECREF(instance);
        return -1;
    }
    Py_DECREF(instance);
    return 0;
}

/*
    Write a token to the current token stack, with kwargs. Steals a
    reference to kwargs.

    Like Tokenizer_emit_token(), but the token class is called with the
    given keyword arguments, which are released on every path. Returns 0
    on success, -1 with an exception set on failure.
*/
int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
                                PyObject* kwargs, int first)
{
    PyObject* instance;

    if (Tokenizer_push_textbuffer(self)) {
        Py_DECREF(kwargs);
        return -1;
    }
    instance = PyObject_Call(token, NOARGS, kwargs);
    if (!instance) {
        Py_DECREF(kwargs);
        return -1;
    }
    if (first ? PyList_Insert(self->topstack->stack, 0, instance):
                PyList_Append(self->topstack->stack, instance)) {
        Py_DECREF(instance);
        Py_DECREF(kwargs);
        return -1;
    }
    Py_DECREF(instance);
    Py_DECREF(kwargs);
    return 0;
}

/*
    Write a Unicode codepoint to the current textbuffer.

    Returns 0 on success, -1 on failure (propagated from Textbuffer_write).
*/
int Tokenizer_emit_char(Tokenizer* self, Unicode code)
{
    return Textbuffer_write(self->topstack->textbuffer, code);
}

/*
    Write a NUL-terminated string of text to the current textbuffer,
    one character at a time. Returns 0 on success, -1 on failure.
*/
int Tokenizer_emit_text(Tokenizer* self, const char* text)
{
    const char* cursor;

    for (cursor = text; *cursor; cursor++) {
        if (Tokenizer_emit_char(self, *cursor))
            return -1;
    }
    return 0;
}

/*
    Write the contents of another textbuffer to the current textbuffer,
    deallocating it in the process.

    'buffer' is consumed even when the concat fails. Returns 0 on
    success, -1 on failure.
*/
int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
{
    int retval = Textbuffer_concat(self->topstack->textbuffer, buffer);
    Textbuffer_dealloc(buffer);
    return retval;
}

/*
    Write a series of tokens to the current stack at once.

    If the first token in 'tokenlist' is a Text token, any pending text in
    the current textbuffer is merged into that token's .text attribute
    (prepended) instead of being emitted as a separate Text token; the
    buffer is then reset. Otherwise the buffer is flushed normally before
    the tokens are appended. Returns 0 on success, -1 on failure.
*/
int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
{
    int pushed = 0;
    PyObject *stack, *token, *left, *right, *text;
    Textbuffer* buffer;
    Py_ssize_t size;

    if (PyList_GET_SIZE(tokenlist) > 0) {
        token = PyList_GET_ITEM(tokenlist, 0);  // borrowed reference
        switch (PyObject_IsInstance(token, Text)) {
            case 0:  // not a Text token: fall through to the normal flush
                break;
            case 1: {
                pushed = 1;
                buffer = self->topstack->textbuffer;
                if (buffer->length == 0)
                    break;
                left = Textbuffer_render(buffer);
                if (!left)
                    return -1;
                // NOTE(review): 'left' appears to leak if this lookup fails.
                right = PyObject_GetAttrString(token, "text");
                if (!right)
                    return -1;
                text = PyUnicode_Concat(left, right);
                Py_DECREF(left);
                Py_DECREF(right);
                if (!text)
                    return -1;
                if (PyObject_SetAttrString(token, "text", text)) {
                    Py_DECREF(text);
                    return -1;
                }
                Py_DECREF(text);
                if (Textbuffer_reset(buffer))
                    return -1;
                break;
            }
            case -1:  // isinstance check itself raised
                return -1;
        }
    }
    if (!pushed) {
        if (Tokenizer_push_textbuffer(self))
            return -1;
    }
    stack = self->topstack->stack;
    size = PyList_GET_SIZE(stack);
    // Extend the stack in place with the whole token list.
    if (PyList_SetSlice(stack, size, size, tokenlist))
        return -1;
    return 0;
}

/*
    Pop the current stack, write text, and then write the stack. 'text' is
    a NULL-terminated array of chars.

    Used to recover from a failed route: the popped tokens are re-emitted
    after the literal text. Returns 0 on success, -1 on failure.
*/
int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
{
    PyObject* stack = Tokenizer_pop(self);

    if (Tokenizer_emit_text(self, text)) {
        // Tokenizer_pop can fail and return NULL; Py_DECREF(NULL) would
        // crash, so use the NULL-tolerant form.
        Py_XDECREF(stack);
        return -1;
    }
    if (stack) {
        if (PyList_GET_SIZE(stack) > 0) {
            if (Tokenizer_emit_all(self, stack)) {
                Py_DECREF(stack);
                return -1;
            }
        }
        Py_DECREF(stack);
    }
    self->head--;
    return 0;
}

/*
    Internal function to read the codepoint at the given index from the input.
    Under PEP 393 builds (Python >= 3.3) the flexible-storage unicode API is
    used; otherwise the legacy Py_UNICODE buffer is indexed directly.
    No bounds checking is done here; callers must validate *index*.
*/
static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index)
{
#ifdef PEP_393
    return PyUnicode_READ(text->kind, text->data, index);
#else
    return text->buf[index];
#endif
}

/*
    Read the codepoint at a relative offset ahead of the current head.
    Returns '\0' when the offset lies past the end of the input.
*/
Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
    Py_ssize_t index = self->head + delta;

    return (index >= self->text.length) ?
        '\0' : read_codepoint(&self->text, index);
}

/*
    Read the codepoint at a relative offset behind the current head.
    Returns '\0' when the offset would reach before the start of the input.
*/
Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
{
    if (delta > self->head)
        return '\0';
    return read_codepoint(&self->text, self->head - delta);
}

+ 62
- 0
mwparserfromhell/parser/ctokenizer/tok_support.h View File

@@ -0,0 +1,62 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

/* Functions */

/* Stack management */

int Tokenizer_push(Tokenizer*, uint64_t);
int Tokenizer_push_textbuffer(Tokenizer*);
void Tokenizer_delete_top_of_stack(Tokenizer*);
PyObject* Tokenizer_pop(Tokenizer*);
PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
void* Tokenizer_fail_route(Tokenizer*);

/* Token and text emission */

int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
int Tokenizer_emit_char(Tokenizer*, Unicode);
int Tokenizer_emit_text(Tokenizer*, const char*);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
int Tokenizer_emit_all(Tokenizer*, PyObject*);
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);

/* Input reading */

Unicode Tokenizer_read(Tokenizer*, Py_ssize_t);
Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);

/* Macros */

/* Hard limits guarding against runaway recursion/iteration in the parser. */
#define MAX_DEPTH 40
#define MAX_CYCLES 100000

/* Whether it is currently safe to recurse into a nested structure. */
#define Tokenizer_CAN_RECURSE(self) \
    (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)

/* Convenience wrappers around Tokenizer_emit_token(_kwargs); the "_first"
   variants insert at the beginning of the current stack instead of the end. */
#define Tokenizer_emit(self, token) \
    Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) \
    Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit_kwargs(self, token, kwargs) \
    Tokenizer_emit_token_kwargs(self, token, kwargs, 0)
#define Tokenizer_emit_first_kwargs(self, token, kwargs) \
    Tokenizer_emit_token_kwargs(self, token, kwargs, 1)

+ 310
- 0
mwparserfromhell/parser/ctokenizer/tokenizer.c View File

@@ -0,0 +1,310 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tokenizer.h"
#include "tok_parse.h"
#include "tokens.h"

/* Globals */

int route_state;
uint64_t route_context;

char** entitydefs;

PyObject* NOARGS;
PyObject* definitions;

static PyObject* ParserError;

/* Forward declarations */

static int load_exceptions(void);

/*
    Create a new tokenizer object; all field initialization happens later in
    Tokenizer_init.
*/
static PyObject*
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
{
    return (PyObject*) type->tp_alloc(type, 0);
}

/*
    Deallocate the given tokenizer's text field by releasing the reference to
    the stored input object.  Py_XDECREF makes this safe when no input has
    been loaded yet (text->object may be NULL).
*/
static void dealloc_tokenizer_text(TokenizerInput* text)
{
    Py_XDECREF(text->object);
}

/*
    Deallocate the given tokenizer object: release the input text, then walk
    and free the entire chain of parse stacks.
*/
static void Tokenizer_dealloc(Tokenizer* self)
{
    Stack *current = self->topstack, *following;

    dealloc_tokenizer_text(&self->text);
    while (current) {
        following = current->next;
        Py_DECREF(current->stack);
        Textbuffer_dealloc(current->textbuffer);
        free(current);
        current = following;
    }
    Py_TYPE(self)->tp_free((PyObject*) self);
}

/*
    Initialize a tokenizer's text field to an empty state: no input loaded,
    zero length.  Py_None is stored as a placeholder for the input object.
*/
static void init_tokenizer_text(TokenizerInput* text)
{
    Py_INCREF(Py_None);
    text->object = Py_None;
    text->length = 0;
#ifdef PEP_393
    text->kind = PyUnicode_WCHAR_KIND;
    text->data = NULL;
#else
    text->buf = NULL;
#endif
}

/*
    Initialize a new tokenizer instance by setting instance attributes.
    The constructor accepts no arguments.
*/
static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
{
    static char* kwlist[] = {NULL};

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
        return -1;
    init_tokenizer_text(&self->text);
    self->topstack = NULL;
    self->head = 0;
    self->global = 0;
    self->depth = 0;
    self->cycles = 0;
    self->route_state = 0;
    self->route_context = 0;
    self->skip_style_tags = 0;
    return 0;
}

/*
    Load input text into the tokenizer.  Takes ownership of the caller's
    reference to *input*: it is stored in text->object and released by
    dealloc_tokenizer_text (which is also called here first to drop any
    previously loaded text).
*/
static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
{
    dealloc_tokenizer_text(text);  /* release any previously loaded input */
    text->object = input;

#ifdef PEP_393
    /* Force the canonical PEP 393 representation before caching kind/data. */
    if (PyUnicode_READY(input) < 0)
        return -1;
    text->kind = PyUnicode_KIND(input);
    text->data = PyUnicode_DATA(input);
#else
    text->buf = PyUnicode_AS_UNICODE(input);
#endif
    /* NOTE(review): PyUnicode_GET_LENGTH is Python 3 API; presumably aliased
       for Python 2 builds in common.h -- confirm. */
    text->length = PyUnicode_GET_LENGTH(input);
    return 0;
}

/*
    Build a list of tokens from a string of wikicode and return it.

    Accepts either a Unicode object or an encoded byte string, plus an
    optional starting context and skip_style_tags flag.  Returns NULL with a
    ParserError set if the tokenizer finishes in an inconsistent state.
*/
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
    PyObject *input, *tokens;
    uint64_t context = 0;
    /* bug fix: the "i" format unit stores a C int; parsing directly into a
       uint64_t only writes sizeof(int) bytes (wrong on big-endian). */
    int ctx_arg = 0;
    int skip_style_tags = 0;

    if (PyArg_ParseTuple(args, "U|ii", &input, &ctx_arg, &skip_style_tags)) {
        Py_INCREF(input);
        if (load_tokenizer_text(&self->text, input))
            return NULL;
    }
    else {
        const char *encoded;
        Py_ssize_t size;

        /* Failed to parse a Unicode object; try a string instead. */
        PyErr_Clear();
        if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &ctx_arg,
                              &skip_style_tags))
            return NULL;
        if (!(input = PyUnicode_FromStringAndSize(encoded, size)))
            return NULL;
        if (load_tokenizer_text(&self->text, input))
            return NULL;
    }
    context = (uint64_t) ctx_arg;

    self->head = self->global = self->depth = self->cycles = 0;
    self->skip_style_tags = skip_style_tags;
    tokens = Tokenizer_parse(self, context, 1);

    /* A NULL result without a pending exception, or a non-empty stack,
       means the tokenizer ended in an inconsistent internal state. */
    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
        if (!ParserError) {
            if (load_exceptions())
                return NULL;
        }
        if (BAD_ROUTE) {
            RESET_ROUTE();
            PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
        }
        else if (self->topstack)
            PyErr_SetString(ParserError,
                            "C tokenizer exited with non-empty token stack");
        else
            PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
        return NULL;
    }
    return tokens;
}

/*
    Build the global entitydefs array -- a NULL-terminated list of HTML
    entity names -- from the standard library's entity definitions module.
    Returns 0 on success, -1 on failure.
*/
static int load_entities(void)
{
    PyObject *tempmod, *defmap, *deflist;
    unsigned numdefs, i;
#ifdef IS_PY3K
    PyObject *string;
#endif

    tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE);
    if (!tempmod)
        return -1;
    defmap = PyObject_GetAttrString(tempmod, "entitydefs");
    Py_DECREF(tempmod);
    if (!defmap)
        return -1;
    deflist = PyDict_Keys(defmap);
    Py_DECREF(defmap);
    if (!deflist)
        return -1;
    /* bug fix: the count must come from deflist (the list of keys); the old
       code read PyList_GET_SIZE(defmap), which is a dict -- not a list --
       and whose reference had already been released above. */
    numdefs = (unsigned) PyList_GET_SIZE(deflist);
    entitydefs = calloc(numdefs + 1, sizeof(char*));
    if (!entitydefs) {
        Py_DECREF(deflist);
        return -1;
    }
    for (i = 0; i < numdefs; i++) {
#ifdef IS_PY3K
        /* The bytes object is deliberately never released: entitydefs[i]
           borrows its internal buffer, which must stay alive. */
        string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i));
        if (!string) {
            Py_DECREF(deflist);
            return -1;
        }
        entitydefs[i] = PyBytes_AsString(string);
#else
        entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
#endif
        if (!entitydefs[i]) {
            Py_DECREF(deflist);
            return -1;
        }
    }
    Py_DECREF(deflist);
    return 0;
}

/*
    Import mwparserfromhell.parser.tokens and load the token classes into
    module-level globals (see load_tokens_from_module in tokens.c).
    Returns 0 on success, -1 on failure.
*/
static int load_tokens(void)
{
    PyObject *tempmod, *tokens,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("tokens");
    char *name = "mwparserfromhell.parser";

    if (!fromlist || !modname)
        return -1;
    PyList_SET_ITEM(fromlist, 0, modname);  /* steals the modname reference */
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    tokens = PyObject_GetAttrString(tempmod, "tokens");
    Py_DECREF(tempmod);
    /* bug fix: previously a NULL result was passed straight on to
       load_tokens_from_module, which would have crashed */
    if (!tokens)
        return -1;
    load_tokens_from_module(tokens);
    Py_DECREF(tokens);
    return 0;
}

/*
    Import mwparserfromhell and store its definitions submodule in the
    module-level "definitions" global.  Returns 0 on success, -1 on failure.
*/
static int load_defs(void)
{
    PyObject *tempmod,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("definitions");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname)
        return -1;
    PyList_SET_ITEM(fromlist, 0, modname);  /* steals the modname reference */
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    definitions = PyObject_GetAttrString(tempmod, "definitions");
    Py_DECREF(tempmod);
    /* bug fix: report failure instead of returning 0 with definitions NULL */
    return definitions ? 0 : -1;
}

/*
    Import mwparserfromhell.parser and cache its ParserError exception class
    in the static ParserError global.  Returns 0 on success, -1 on failure.
*/
static int load_exceptions(void)
{
    PyObject *tempmod, *parsermod,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("parser");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname)
        return -1;
    PyList_SET_ITEM(fromlist, 0, modname);  /* steals the modname reference */
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    parsermod = PyObject_GetAttrString(tempmod, "parser");
    Py_DECREF(tempmod);
    /* bug fix: a NULL parsermod was previously passed on to
       PyObject_GetAttrString, which would have crashed */
    if (!parsermod)
        return -1;
    ParserError = PyObject_GetAttrString(parsermod, "ParserError");
    Py_DECREF(parsermod);
    return ParserError ? 0 : -1;
}

/*
    Module initialization: ready and register the CTokenizer type on the
    _tokenizer module, then load the token classes, HTML entity definitions,
    and the tag-definitions helper.  Supports both Python 2 and 3 through the
    compatibility macros defined in tokenizer.h (INIT_FUNC_NAME,
    CREATE_MODULE, INIT_ERROR).
*/
PyMODINIT_FUNC INIT_FUNC_NAME(void)
{
    PyObject *module;

    TokenizerType.tp_new = PyType_GenericNew;
    if (PyType_Ready(&TokenizerType) < 0)
        INIT_ERROR;
    module = CREATE_MODULE;
    if (!module)
        INIT_ERROR;
    Py_INCREF(&TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
    Py_INCREF(Py_True);
    /* Marker attribute so Python code can detect that the C tokenizer is
       in use (vs. the pure-Python fallback). */
    PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
    NOARGS = PyTuple_New(0);
    if (!NOARGS || load_entities() || load_tokens() || load_defs())
        INIT_ERROR;
#ifdef IS_PY3K
    return module;
#endif
}

+ 111
- 0
mwparserfromhell/parser/ctokenizer/tokenizer.h View File

@@ -0,0 +1,111 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"
#include "textbuffer.h"

/* Functions */

static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

/* Compatibility macros */

#ifdef IS_PY3K
/* Python 3: PyModule_Create-based init returning the module object. */
#define IMPORT_NAME_FUNC PyUnicode_FromString
#define CREATE_MODULE PyModule_Create(&module_def);
#define ENTITYDEFS_MODULE "html.entities"
#define INIT_FUNC_NAME PyInit__tokenizer
#define INIT_ERROR return NULL
#else
/* Python 2: Py_InitModule-based init; the init function returns void. */
#define IMPORT_NAME_FUNC PyBytes_FromString
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL);
#define ENTITYDEFS_MODULE "htmlentitydefs"
#define INIT_FUNC_NAME init_tokenizer
#define INIT_ERROR return
#endif

/* Structs */

/* Methods exposed on the CTokenizer type. */
static PyMethodDef Tokenizer_methods[] = {
    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
     "Build a list of tokens from a string of wikicode and return it."},
    {NULL}
};

/* No public members; the sentinel-only table is still required. */
static PyMemberDef Tokenizer_members[] = {
    {NULL}
};

/* Type object for CTokenizer. */
static PyTypeObject TokenizerType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_tokenizer.CTokenizer",                                /* tp_name */
    sizeof(Tokenizer),                                      /* tp_basicsize */
    0,                                                      /* tp_itemsize */
    (destructor) Tokenizer_dealloc,                         /* tp_dealloc */
    0,                                                      /* tp_print */
    0,                                                      /* tp_getattr */
    0,                                                      /* tp_setattr */
    0,                                                      /* tp_compare */
    0,                                                      /* tp_repr */
    0,                                                      /* tp_as_number */
    0,                                                      /* tp_as_sequence */
    0,                                                      /* tp_as_mapping */
    0,                                                      /* tp_hash */
    0,                                                      /* tp_call */
    0,                                                      /* tp_str */
    0,                                                      /* tp_getattro */
    0,                                                      /* tp_setattro */
    0,                                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                                     /* tp_flags */
    "Creates a list of tokens from a string of wikicode.",  /* tp_doc */
    0,                                                      /* tp_traverse */
    0,                                                      /* tp_clear */
    0,                                                      /* tp_richcompare */
    0,                                                      /* tp_weaklistoffset */
    0,                                                      /* tp_iter */
    0,                                                      /* tp_iternext */
    Tokenizer_methods,                                      /* tp_methods */
    Tokenizer_members,                                      /* tp_members */
    0,                                                      /* tp_getset */
    0,                                                      /* tp_base */
    0,                                                      /* tp_dict */
    0,                                                      /* tp_descr_get */
    0,                                                      /* tp_descr_set */
    0,                                                      /* tp_dictoffset */
    (initproc) Tokenizer_init,                              /* tp_init */
    0,                                                      /* tp_alloc */
    Tokenizer_new,                                          /* tp_new */
};

#ifdef IS_PY3K
/* Module definition (Python 3 only; Python 2 uses Py_InitModule). */
static PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenizer",
    "Creates a list of tokens from a string of wikicode.",
    -1, NULL, NULL, NULL, NULL, NULL
};
#endif

+ 111
- 0
mwparserfromhell/parser/ctokenizer/tokens.c View File

@@ -0,0 +1,111 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tokens.h"

/* Globals */

PyObject* Text;

PyObject* TemplateOpen;
PyObject* TemplateParamSeparator;
PyObject* TemplateParamEquals;
PyObject* TemplateClose;

PyObject* ArgumentOpen;
PyObject* ArgumentSeparator;
PyObject* ArgumentClose;

PyObject* WikilinkOpen;
PyObject* WikilinkSeparator;
PyObject* WikilinkClose;

PyObject* ExternalLinkOpen;
PyObject* ExternalLinkSeparator;
PyObject* ExternalLinkClose;

PyObject* HTMLEntityStart;
PyObject* HTMLEntityNumeric;
PyObject* HTMLEntityHex;
PyObject* HTMLEntityEnd;
PyObject* HeadingStart;
PyObject* HeadingEnd;

PyObject* CommentStart;
PyObject* CommentEnd;

PyObject* TagOpenOpen;
PyObject* TagAttrStart;
PyObject* TagAttrEquals;
PyObject* TagAttrQuote;
PyObject* TagCloseOpen;
PyObject* TagCloseSelfclose;
PyObject* TagOpenClose;
PyObject* TagCloseClose;

/*
    Load individual token classes into module-level globals from the given
    Python module object (mwparserfromhell.parser.tokens), using a
    table-driven lookup instead of one assignment statement per token.
*/
void load_tokens_from_module(PyObject* module)
{
    static const struct {
        PyObject** target;
        const char* attr;
    } token_table[] = {
        {&Text, "Text"},

        {&TemplateOpen, "TemplateOpen"},
        {&TemplateParamSeparator, "TemplateParamSeparator"},
        {&TemplateParamEquals, "TemplateParamEquals"},
        {&TemplateClose, "TemplateClose"},

        {&ArgumentOpen, "ArgumentOpen"},
        {&ArgumentSeparator, "ArgumentSeparator"},
        {&ArgumentClose, "ArgumentClose"},

        {&WikilinkOpen, "WikilinkOpen"},
        {&WikilinkSeparator, "WikilinkSeparator"},
        {&WikilinkClose, "WikilinkClose"},

        {&ExternalLinkOpen, "ExternalLinkOpen"},
        {&ExternalLinkSeparator, "ExternalLinkSeparator"},
        {&ExternalLinkClose, "ExternalLinkClose"},

        {&HTMLEntityStart, "HTMLEntityStart"},
        {&HTMLEntityNumeric, "HTMLEntityNumeric"},
        {&HTMLEntityHex, "HTMLEntityHex"},
        {&HTMLEntityEnd, "HTMLEntityEnd"},

        {&HeadingStart, "HeadingStart"},
        {&HeadingEnd, "HeadingEnd"},

        {&CommentStart, "CommentStart"},
        {&CommentEnd, "CommentEnd"},

        {&TagOpenOpen, "TagOpenOpen"},
        {&TagAttrStart, "TagAttrStart"},
        {&TagAttrEquals, "TagAttrEquals"},
        {&TagAttrQuote, "TagAttrQuote"},
        {&TagCloseOpen, "TagCloseOpen"},
        {&TagCloseSelfclose, "TagCloseSelfclose"},
        {&TagOpenClose, "TagOpenClose"},
        {&TagCloseClose, "TagCloseClose"},

        {NULL, NULL}
    };
    int i;

    for (i = 0; token_table[i].target; i++)
        *token_table[i].target =
            PyObject_GetAttrString(module, token_table[i].attr);
}

+ 69
- 0
mwparserfromhell/parser/ctokenizer/tokens.h View File

@@ -0,0 +1,69 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

/* Token globals */

extern PyObject* Text;

extern PyObject* TemplateOpen;
extern PyObject* TemplateParamSeparator;
extern PyObject* TemplateParamEquals;
extern PyObject* TemplateClose;

extern PyObject* ArgumentOpen;
extern PyObject* ArgumentSeparator;
extern PyObject* ArgumentClose;

extern PyObject* WikilinkOpen;
extern PyObject* WikilinkSeparator;
extern PyObject* WikilinkClose;

extern PyObject* ExternalLinkOpen;
extern PyObject* ExternalLinkSeparator;
extern PyObject* ExternalLinkClose;

extern PyObject* HTMLEntityStart;
extern PyObject* HTMLEntityNumeric;
extern PyObject* HTMLEntityHex;
extern PyObject* HTMLEntityEnd;
extern PyObject* HeadingStart;
extern PyObject* HeadingEnd;

extern PyObject* CommentStart;
extern PyObject* CommentEnd;

extern PyObject* TagOpenOpen;
extern PyObject* TagAttrStart;
extern PyObject* TagAttrEquals;
extern PyObject* TagAttrQuote;
extern PyObject* TagCloseOpen;
extern PyObject* TagCloseSelfclose;
extern PyObject* TagOpenClose;
extern PyObject* TagCloseClose;

/* Functions */

void load_tokens_from_module(PyObject*);

+ 0
- 367
mwparserfromhell/parser/tokenizer.h View File

@@ -1,367 +0,0 @@
/*
Tokenizer Header File for MWParserFromHell
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN
#endif

#include <Python.h>
#include <math.h>
#include <structmember.h>
#include <bytesobject.h>
#include <stdint.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

#define malloc PyObject_Malloc
#define free PyObject_Free

#define DIGITS "0123456789"
#define HEXDIGITS "0123456789abcdefABCDEF"
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

static const char MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '!', '\n', '\0'};

#define NUM_MARKERS 19
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
#define MAX_BRACES 255
#define MAX_ENTITY_SIZE 8

static int route_state = 0;
static uint64_t route_context = 0;
#define BAD_ROUTE route_state
#define BAD_ROUTE_CONTEXT route_context
#define FAIL_ROUTE(context) route_state = 1; route_context = context
#define RESET_ROUTE() route_state = 0

static char** entitydefs;

static PyObject* EMPTY;
static PyObject* NOARGS;
static PyObject* ParserError;
static PyObject* definitions;


/* Tokens: */

static PyObject* Text;

static PyObject* TemplateOpen;
static PyObject* TemplateParamSeparator;
static PyObject* TemplateParamEquals;
static PyObject* TemplateClose;

static PyObject* ArgumentOpen;
static PyObject* ArgumentSeparator;
static PyObject* ArgumentClose;

static PyObject* WikilinkOpen;
static PyObject* WikilinkSeparator;
static PyObject* WikilinkClose;

static PyObject* ExternalLinkOpen;
static PyObject* ExternalLinkSeparator;
static PyObject* ExternalLinkClose;

static PyObject* HTMLEntityStart;
static PyObject* HTMLEntityNumeric;
static PyObject* HTMLEntityHex;
static PyObject* HTMLEntityEnd;
static PyObject* HeadingStart;
static PyObject* HeadingEnd;

static PyObject* CommentStart;
static PyObject* CommentEnd;

static PyObject* TagOpenOpen;
static PyObject* TagAttrStart;
static PyObject* TagAttrEquals;
static PyObject* TagAttrQuote;
static PyObject* TagCloseOpen;
static PyObject* TagCloseSelfclose;
static PyObject* TagOpenClose;
static PyObject* TagCloseClose;


/* Local contexts: */

#define LC_TEMPLATE 0x0000000000000007
#define LC_TEMPLATE_NAME 0x0000000000000001
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004

#define LC_ARGUMENT 0x0000000000000018
#define LC_ARGUMENT_NAME 0x0000000000000008
#define LC_ARGUMENT_DEFAULT 0x0000000000000010

#define LC_WIKILINK 0x0000000000000060
#define LC_WIKILINK_TITLE 0x0000000000000020
#define LC_WIKILINK_TEXT 0x0000000000000040

#define LC_EXT_LINK 0x0000000000000180
#define LC_EXT_LINK_URI 0x0000000000000080
#define LC_EXT_LINK_TITLE 0x0000000000000100

#define LC_HEADING 0x0000000000007E00
#define LC_HEADING_LEVEL_1 0x0000000000000200
#define LC_HEADING_LEVEL_2 0x0000000000000400
#define LC_HEADING_LEVEL_3 0x0000000000000800
#define LC_HEADING_LEVEL_4 0x0000000000001000
#define LC_HEADING_LEVEL_5 0x0000000000002000
#define LC_HEADING_LEVEL_6 0x0000000000004000

#define LC_TAG 0x0000000000078000
#define LC_TAG_OPEN 0x0000000000008000
#define LC_TAG_ATTR 0x0000000000010000
#define LC_TAG_BODY 0x0000000000020000
#define LC_TAG_CLOSE 0x0000000000040000

#define LC_STYLE 0x0000000000780000
#define LC_STYLE_ITALICS 0x0000000000080000
#define LC_STYLE_BOLD 0x0000000000100000
#define LC_STYLE_PASS_AGAIN 0x0000000000200000
#define LC_STYLE_SECOND_PASS 0x0000000000400000

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000007F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000
#define LC_HAS_TEMPLATE 0x0000000040000000

#define LC_TABLE 0x0000001F80000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
#define LC_TABLE_OPEN 0x0000000080000000
#define LC_TABLE_CELL_OPEN 0x0000000100000000
#define LC_TABLE_CELL_STYLE 0x0000000200000000
#define LC_TABLE_ROW_OPEN 0x0000000400000000
#define LC_TABLE_TD_LINE 0x0000000800000000
#define LC_TABLE_TH_LINE 0x0000001000000000

/* Global contexts: */

#define GL_HEADING 0x1

/* Aggregate contexts: */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

/* Tag contexts: */

#define TAG_NAME 0x01
#define TAG_ATTR_READY 0x02
#define TAG_ATTR_NAME 0x04
#define TAG_ATTR_VALUE 0x08
#define TAG_QUOTED 0x10
#define TAG_NOTE_SPACE 0x20
#define TAG_NOTE_EQUALS 0x40
#define TAG_NOTE_QUOTE 0x80


/* Miscellaneous structs: */

struct Textbuffer {
Py_ssize_t size;
Py_UNICODE* data;
struct Textbuffer* prev;
struct Textbuffer* next;
};

struct Stack {
PyObject* stack;
uint64_t context;
struct Textbuffer* textbuffer;
struct Stack* next;
};

typedef struct {
PyObject* title;
int level;
} HeadingData;

typedef struct {
uint64_t context;
struct Textbuffer* pad_first;
struct Textbuffer* pad_before_eq;
struct Textbuffer* pad_after_eq;
Py_UNICODE quoter;
Py_ssize_t reset;
} TagData;

typedef struct Textbuffer Textbuffer;
typedef struct Stack Stack;


/* Tokenizer object definition: */

typedef struct {
PyObject_HEAD
PyObject* text; /* text to tokenize */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
Py_ssize_t length; /* length of text */
int global; /* global context */
int depth; /* stack recursion depth */
int cycles; /* total number of stack recursions */
int skip_style_tags; /* temporary fix for the sometimes broken tag parser */
} Tokenizer;


/* Macros related to Tokenizer functions: */

#define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta)))
#define Tokenizer_READ_BACKWARDS(self, delta) \
(*PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, delta)))
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)

#define Tokenizer_emit(self, token) Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 0)
#define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1)


/* Macros for accessing definitions: */

#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL))
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL))
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL))
#define IS_SCHEME(scheme, slashes, reverse) \
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False))


/* Function prototypes: */

static Textbuffer* Textbuffer_new(void);
static void Textbuffer_dealloc(Textbuffer*);

static TagData* TagData_new(void);
static void TagData_dealloc(TagData*);

static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

static int load_exceptions(void);


/* Macros for Python 2/3 compatibility: */

#ifdef IS_PY3K
#define NEW_INT_FUNC PyLong_FromSsize_t
#define IMPORT_NAME_FUNC PyUnicode_FromString
#define CREATE_MODULE PyModule_Create(&module_def);
#define ENTITYDEFS_MODULE "html.entities"
#define INIT_FUNC_NAME PyInit__tokenizer
#define INIT_ERROR return NULL
#else
#define NEW_INT_FUNC PyInt_FromSsize_t
#define IMPORT_NAME_FUNC PyBytes_FromString
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL);
#define ENTITYDEFS_MODULE "htmlentitydefs"
#define INIT_FUNC_NAME init_tokenizer
#define INIT_ERROR return
#endif


/* More structs for creating the Tokenizer type: */

static PyMethodDef Tokenizer_methods[] = {
{"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
"Build a list of tokens from a string of wikicode and return it."},
{NULL}
};

static PyMemberDef Tokenizer_members[] = {
{NULL}
};

static PyTypeObject TokenizerType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_tokenizer.CTokenizer", /* tp_name */
sizeof(Tokenizer), /* tp_basicsize */
0, /* tp_itemsize */
(destructor) Tokenizer_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"Creates a list of tokens from a string of wikicode.", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
Tokenizer_methods, /* tp_methods */
Tokenizer_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc) Tokenizer_init, /* tp_init */
0, /* tp_alloc */
Tokenizer_new, /* tp_new */
};

#ifdef IS_PY3K
/* Python 3 module definition for "_tokenizer", consumed by the
   CREATE_MODULE macro above. m_size is -1: the module keeps state in
   globals and does not support multiple sub-interpreters. The remaining
   slots (methods, reload, traverse, clear, free) are unused. */
static PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenizer",
    "Creates a list of tokens from a string of wikicode.",
    -1, NULL, NULL, NULL, NULL, NULL
};
#endif

+ 42
- 58
mwparserfromhell/smart_list.py View File

@@ -27,8 +27,10 @@ reflect changes made to the main list, and vice-versa.
"""

from __future__ import unicode_literals
from sys import maxsize
from weakref import ref

from .compat import maxsize, py3k
from .compat import py3k

__all__ = ["SmartList"]

@@ -45,16 +47,16 @@ def inheritdoc(method):
class _SliceNormalizerMixIn(object):
"""MixIn that provides a private method to normalize slices."""

def _normalize_slice(self, key):
def _normalize_slice(self, key, clamp=False):
"""Return a slice equivalent to the input *key*, standardized."""
if key.start is not None:
if key.start is None:
start = 0
else:
start = (len(self) + key.start) if key.start < 0 else key.start
if key.stop is None or key.stop == maxsize:
stop = len(self) if clamp else None
else:
start = 0
if key.stop is not None:
stop = (len(self) + key.stop) if key.stop < 0 else key.stop
else:
stop = maxsize
return slice(start, stop, key.step or 1)


@@ -80,13 +82,6 @@ class SmartList(_SliceNormalizerMixIn, list):
[2, 3, 4]
>>> parent
[0, 1, 2, 3, 4]

The parent needs to keep a list of its children in order to update them,
which prevents them from being garbage-collected. If you are keeping the
parent around for a while but creating many children, it is advisable to
call :meth:`._ListProxy.detach` when you're finished with them. Certain
parent methods, like :meth:`reverse` and :meth:`sort`, will do this
automatically.
"""

def __init__(self, iterable=None):
@@ -99,10 +94,11 @@ class SmartList(_SliceNormalizerMixIn, list):
def __getitem__(self, key):
if not isinstance(key, slice):
return super(SmartList, self).__getitem__(key)
key = self._normalize_slice(key)
key = self._normalize_slice(key, clamp=False)
sliceinfo = [key.start, key.stop, key.step]
child = _ListProxy(self, sliceinfo)
self._children[id(child)] = (child, sliceinfo)
child_ref = ref(child, self._delete_child)
self._children[id(child_ref)] = (child_ref, sliceinfo)
return child

def __setitem__(self, key, item):
@@ -110,20 +106,21 @@ class SmartList(_SliceNormalizerMixIn, list):
return super(SmartList, self).__setitem__(key, item)
item = list(item)
super(SmartList, self).__setitem__(key, item)
key = self._normalize_slice(key)
key = self._normalize_slice(key, clamp=True)
diff = len(item) + (key.start - key.stop) // key.step
if not diff:
return
values = self._children.values if py3k else self._children.itervalues
if diff:
for child, (start, stop, step) in values():
if start > key.stop:
self._children[id(child)][1][0] += diff
if stop >= key.stop and stop != maxsize:
self._children[id(child)][1][1] += diff
for child, (start, stop, step) in values():
if start > key.stop:
self._children[id(child)][1][0] += diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] += diff

def __delitem__(self, key):
super(SmartList, self).__delitem__(key)
if isinstance(key, slice):
key = self._normalize_slice(key)
key = self._normalize_slice(key, clamp=True)
else:
key = slice(key, key + 1, 1)
diff = (key.stop - key.start) // key.step
@@ -131,7 +128,7 @@ class SmartList(_SliceNormalizerMixIn, list):
for child, (start, stop, step) in values():
if start > key.start:
self._children[id(child)][1][0] -= diff
if stop >= key.stop and stop != maxsize:
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] -= diff

if not py3k:
@@ -154,10 +151,16 @@ class SmartList(_SliceNormalizerMixIn, list):
self.extend(other)
return self

def _delete_child(self, child_ref):
"""Remove a child reference that is about to be garbage-collected."""
del self._children[id(child_ref)]

def _detach_children(self):
"""Remove all children and give them independent parent copies."""
children = [val[0] for val in self._children.values()]
for child in children:
child.detach()
child()._parent = list(self)
self._children.clear()

@inheritdoc
def append(self, item):
@@ -226,7 +229,6 @@ class _ListProxy(_SliceNormalizerMixIn, list):
super(_ListProxy, self).__init__()
self._parent = parent
self._sliceinfo = sliceinfo
self._detached = False

def __repr__(self):
return repr(self._render())
@@ -273,24 +275,20 @@ class _ListProxy(_SliceNormalizerMixIn, list):

def __getitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key)
if key.stop == maxsize:
keystop = self._stop
else:
keystop = key.stop + self._start
adjusted = slice(key.start + self._start, keystop, key.step)
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
return self._parent[adjusted]
else:
return self._render()[key]

def __setitem__(self, key, item):
if isinstance(key, slice):
key = self._normalize_slice(key)
if key.stop == maxsize:
keystop = self._stop
else:
keystop = key.stop + self._start
adjusted = slice(key.start + self._start, keystop, key.step)
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
self._parent[adjusted] = item
else:
length = len(self)
@@ -302,12 +300,10 @@ class _ListProxy(_SliceNormalizerMixIn, list):

def __delitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key)
if key.stop == maxsize:
keystop = self._stop
else:
keystop = key.stop + self._start
adjusted = slice(key.start + self._start, keystop, key.step)
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
del self._parent[adjusted]
else:
length = len(self)
@@ -370,7 +366,7 @@ class _ListProxy(_SliceNormalizerMixIn, list):
@property
def _stop(self):
"""The ending index of this list, exclusive."""
if self._sliceinfo[1] == maxsize:
if self._sliceinfo[1] is None:
return len(self._parent)
return self._sliceinfo[1]

@@ -456,17 +452,5 @@ class _ListProxy(_SliceNormalizerMixIn, list):
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item

def detach(self):
"""Detach the child so it operates like a normal list.

This allows children to be properly garbage-collected if their parent
is being kept around for a long time. This method has no effect if the
child is already detached.
"""
if not self._detached:
self._parent._children.pop(id(self))
self._parent = list(self._parent)
self._detached = True


del inheritdoc

+ 3
- 0
scripts/README View File

@@ -0,0 +1,3 @@
This directory contains support files used for *developing* mwparserfromhell,
not running it. If you are looking for code examples, read the documentation
or explore the source code.

+ 11
- 11
scripts/release.sh View File

@@ -31,6 +31,13 @@ update_version() {
echo " done."
}

update_appveyor() {
filename="appveyor.yml"
echo -n "Updating $filename..."
sed -e "s/version: .*/version: $VERSION-b{build}/" -i "" $filename
echo " done."
}

update_changelog() {
filename="CHANGELOG"
echo -n "Updating $filename..."
@@ -67,25 +74,18 @@ do_git_stuff() {
}

upload_to_pypi() {
# TODO: check whether these commands give output
echo -n "PyPI: uploading source tarball and docs..."
python setup.py register sdist upload -s
python setup.py upload_docs
python setup.py -q register sdist upload -s
python setup.py -q upload_docs
echo " done."
}

windows_build() {
echo "PyPI: building/uploading Windows binaries..."
echo "*** Run in Windows: ./scripts/win_build.py"
echo "*** Press enter when done."
read
}

post_release() {
echo
echo "*** Release completed."
echo "*** Update: https://github.com/earwig/mwparserfromhell/releases/tag/v$VERSION"
echo "*** Verify: https://pypi.python.org/pypi/mwparserfromhell"
echo "*** Verify: https://ci.appveyor.com/project/earwig/mwparserfromhell"
echo "*** Verify: https://mwparserfromhell.readthedocs.org"
echo "*** Press enter to sanity-check the release."
read
@@ -153,11 +153,11 @@ cd "$SCRIPT_DIR/.."

check_git
update_version
update_appveyor
update_changelog
update_docs_changelog
do_git_stuff
upload_to_pypi
windows_build
post_release
test_release



+ 0
- 58
scripts/win_build.py View File

@@ -1,58 +0,0 @@
# Build requirements:
#
# Python 2.6-3.2: Visual C++ Express Edition 2008:
# http://go.microsoft.com/?linkid=7729279
#
# Python 3.3+: Visual C++ Express Edition 2010:
# http://go.microsoft.com/?linkid=9709949
#
# x64 builds: Microsoft Windows SDK for Windows 7 and .NET Framework 3.5 SP1:
# http://www.microsoft.com/en-us/download/details.aspx?id=3138
#
# Python interpreter, 2.6, 2.7, 3.2-3.4:
# https://www.python.org/downloads/
#
# Pip, setuptools, wheel:
# https://bootstrap.pypa.io/get-pip.py
# and run *for each* Python version:
# c:\pythonXX\python get-pip.py
# c:\pythonXX\scripts\pip install wheel
#
# Afterwards, run this script with any of the python interpreters (2.7 suggested)

from __future__ import print_function
import os
from subprocess import call, STDOUT

ENVIRONMENTS = ["26", "27", "32", "33", "34"]

def run(pyver, cmds):
cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds
print(" ".join(cmd), end=" ")

with open("%s%s.log" % (cmds[0], pyver), "w") as logfile:
retval = call(cmd, stdout=logfile, stderr=STDOUT, cwd="..")
if not retval:
print("[OK]")
else:
print("[FAILED (%i)]" % retval)
return retval

def main():
path = os.path.split(__file__)[0]
if path:
os.chdir(path)

print("Building Windows wheels for Python %s:" % ", ".join(ENVIRONMENTS))
for pyver in ENVIRONMENTS:
print()
try:
os.unlink("mwparserfromhell/parser/_tokenizer.pyd")
except OSError:
pass

if run(pyver, ["test"]) == 0:
run(pyver, ["bdist_wheel", "upload"]) # TODO: add "-s" to GPG sign

if __name__ == "__main__":
main()

+ 43
- 0
scripts/win_wrapper.cmd View File

@@ -0,0 +1,43 @@
:: To build extensions for 64 bit Python 3, we need to configure environment
:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of:
:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1)
::
:: To build extensions for 64 bit Python 2, we need to configure environment
:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of:
:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0)
::
:: 32 bit builds do not require specific environment configurations.
::
:: Note: this script needs to be run with the /E:ON and /V:ON flags for the
:: cmd interpreter, at least for (SDK v7.0)
::
:: More details at:
:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows
:: http://stackoverflow.com/a/13751649/163740
::
:: Author: Olivier Grisel
:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/
@ECHO OFF

SET COMMAND_TO_RUN=%*
SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows

SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%"
IF %MAJOR_PYTHON_VERSION% == "2" (
SET WINDOWS_SDK_VERSION="v7.0"
) ELSE IF %MAJOR_PYTHON_VERSION% == "3" (
SET WINDOWS_SDK_VERSION="v7.1"
) ELSE (
ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%"
EXIT 1
)

IF "%PYTHON_ARCH%"=="64" (
SET DISTUTILS_USE_SDK=1
SET MSSdk=1
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION%
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release
call %COMMAND_TO_RUN% || EXIT 1
) ELSE (
call %COMMAND_TO_RUN% || EXIT 1
)

+ 38
- 59
setup.py View File

@@ -21,17 +21,18 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
from __future__ import print_function
from distutils.errors import DistutilsError, CCompilerError
from glob import glob
from os import environ
import sys

if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \
(sys.version_info[1] == 3 and sys.version_info[1] < 2):
raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+")

if sys.version_info >= (3, 0):
basestring = (str, )
if ((sys.version_info[0] == 2 and sys.version_info[1] < 6) or
(sys.version_info[1] == 3 and sys.version_info[1] < 2)):
raise RuntimeError("mwparserfromhell needs Python 2.6+ or 3.2+")

from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext

from mwparserfromhell import __version__
from mwparserfromhell.compat import py26, py3k
@@ -39,70 +40,48 @@ from mwparserfromhell.compat import py26, py3k
with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp:
long_docs = fp.read()

tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources=["mwparserfromhell/parser/tokenizer.c"],
depends=["mwparserfromhell/parser/tokenizer.h"])

use_extension = True
fallback = True

# Allow env var WITHOUT_EXTENSION and args --with[out]-extension
if '--without-extension' in sys.argv:
use_extension = False
elif '--with-extension' in sys.argv:
pass
elif os.environ.get('WITHOUT_EXTENSION', '0') == '1':
use_extension = False
# Allow env var WITHOUT_EXTENSION and args --with[out]-extension:

# Remove the command line argument as it isn't understood by
# setuptools/distutils
sys.argv = [arg for arg in sys.argv
if not arg.startswith('--with')
and not arg.endswith('-extension')]


def optional_compile_setup(func=setup, use_ext=use_extension,
*args, **kwargs):
"""
Wrap setup to allow optional compilation of extensions.
env_var = environ.get("WITHOUT_EXTENSION")
if "--without-extension" in sys.argv:
use_extension = False
elif "--with-extension" in sys.argv:
fallback = False
elif env_var is not None:
if env_var == "1":
use_extension = False
elif env_var == "0":
fallback = False

Falls back to pure python mode (no extensions)
if compilation of extensions fails.
"""
extensions = kwargs.get('ext_modules', None)
# Remove the command line argument as it isn't understood by setuptools:

if use_ext and extensions:
try:
func(*args, **kwargs)
return
except SystemExit as e:
assert(e.args)
if e.args[0] is False:
raise
elif isinstance(e.args[0], basestring):
if e.args[0].startswith('usage: '):
raise
else:
# Fallback to pure python mode
print('setup with extension failed: %s' % repr(e))
pass
except Exception as e:
print('setup with extension failed: %s' % repr(e))
sys.argv = [arg for arg in sys.argv
if arg != "--without-extension" and arg != "--with-extension"]

if extensions:
if use_ext:
print('Falling back to pure python mode.')
else:
print('Using pure python mode.')
def build_ext_patched(self):
try:
build_ext_original(self)
except (DistutilsError, CCompilerError) as exc:
print("error: " + str(exc))
print("Falling back to pure Python mode.")
del self.extensions[:]

del kwargs['ext_modules']
if fallback:
build_ext.run, build_ext_original = build_ext_patched, build_ext.run

func(*args, **kwargs)
# Project-specific part begins here:

tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources=glob("mwparserfromhell/parser/ctokenizer/*.c"),
depends=glob("mwparserfromhell/parser/ctokenizer/*.h"))

optional_compile_setup(
setup(
name = "mwparserfromhell",
packages = find_packages(exclude=("tests",)),
ext_modules = [tokenizer],
ext_modules = [tokenizer] if use_extension else [],
tests_require = ["unittest2"] if py26 else [],
test_suite = "tests.discover",
version = __version__,


+ 41
- 26
tests/_test_tokenizer.py View File

@@ -42,8 +42,8 @@ class TokenizerTestCase(object):
directory.
"""

@classmethod
def _build_test_method(cls, funcname, data):
@staticmethod
def _build_test_method(funcname, data):
"""Create and return a method to be treated as a test case method.

*data* is a dict containing multiple keys: the *input* text to be
@@ -58,13 +58,35 @@ class TokenizerTestCase(object):
expected = data["output"]
actual = self.tokenizer().tokenize(data["input"])
self.assertEqual(expected, actual)

if not py3k:
inner.__name__ = funcname.encode("utf8")
inner.__doc__ = data["label"]
return inner

@staticmethod
def _parse_test(test, data):
"""Parse an individual *test*, storing its info in *data*."""
for line in test.strip().splitlines():
if line.startswith("name:"):
data["name"] = line[len("name:"):].strip()
elif line.startswith("label:"):
data["label"] = line[len("label:"):].strip()
elif line.startswith("input:"):
raw = line[len("input:"):].strip()
if raw[0] == '"' and raw[-1] == '"':
raw = raw[1:-1]
raw = raw.encode("raw_unicode_escape")
data["input"] = raw.decode("unicode_escape")
elif line.startswith("output:"):
raw = line[len("output:"):].strip()
try:
data["output"] = eval(raw, vars(tokens))
except Exception as err:
raise _TestParseError(err)

@classmethod
def _load_tests(cls, filename, name, text):
def _load_tests(cls, filename, name, text, restrict=None):
"""Load all tests in *text* from the file *filename*."""
tests = text.split("\n---\n")
counter = 1
@@ -72,23 +94,7 @@ class TokenizerTestCase(object):
for test in tests:
data = {"name": None, "label": None, "input": None, "output": None}
try:
for line in test.strip().splitlines():
if line.startswith("name:"):
data["name"] = line[len("name:"):].strip()
elif line.startswith("label:"):
data["label"] = line[len("label:"):].strip()
elif line.startswith("input:"):
raw = line[len("input:"):].strip()
if raw[0] == '"' and raw[-1] == '"':
raw = raw[1:-1]
raw = raw.encode("raw_unicode_escape")
data["input"] = raw.decode("unicode_escape")
elif line.startswith("output:"):
raw = line[len("output:"):].strip()
try:
data["output"] = eval(raw, vars(tokens))
except Exception as err:
raise _TestParseError(err)
cls._parse_test(test, data)
except _TestParseError as err:
if data["name"]:
error = "Could not parse test '{0}' in '{1}':\n\t{2}"
@@ -97,6 +103,7 @@ class TokenizerTestCase(object):
error = "Could not parse a test in '{0}':\n\t{1}"
print(error.format(filename, err))
continue

if not data["name"]:
error = "A test in '{0}' was ignored because it lacked a name"
print(error.format(filename))
@@ -105,27 +112,35 @@ class TokenizerTestCase(object):
error = "Test '{0}' in '{1}' was ignored because it lacked an input or an output"
print(error.format(data["name"], filename))
continue

number = str(counter).zfill(digits)
counter += 1
if restrict and data["name"] != restrict:
continue

fname = "test_{0}{1}_{2}".format(name, number, data["name"])
meth = cls._build_test_method(fname, data)
setattr(cls, fname, meth)
counter += 1

@classmethod
def build(cls):
"""Load and install all tests from the 'tokenizer' directory."""
def load_file(filename):
def load_file(filename, restrict=None):
with codecs.open(filename, "rU", encoding="utf8") as fp:
text = fp.read()
name = path.split(filename)[1][:0-len(extension)]
cls._load_tests(filename, name, text)
name = path.split(filename)[1][:-len(extension)]
cls._load_tests(filename, name, text, restrict)

directory = path.join(path.dirname(__file__), "tokenizer")
extension = ".mwtest"
if len(sys.argv) > 2 and sys.argv[1] == "--use":
for name in sys.argv[2:]:
load_file(path.join(directory, name + extension))
sys.argv = [sys.argv[0]] # So unittest doesn't try to load these
if "." in name:
name, test = name.split(".", 1)
else:
test = None
load_file(path.join(directory, name + extension), test)
sys.argv = [sys.argv[0]] # So unittest doesn't try to parse this
cls.skip_others = True
else:
for filename in listdir(directory):


+ 20
- 12
tests/test_smart_list.py View File

@@ -52,6 +52,7 @@ class TestSmartList(unittest.TestCase):
self.assertEqual([0, 1, 2], list1[:3])
self.assertEqual([0, 1, 2, 3, "one", "two"], list1[:])
self.assertEqual([3, "one", "two"], list1[3:])
self.assertEqual([3, "one", "two"], list1[3:100])
self.assertEqual(["one", "two"], list1[-2:])
self.assertEqual([0, 1], list1[:-4])
self.assertEqual([], list1[6:])
@@ -389,28 +390,35 @@ class TestSmartList(unittest.TestCase):
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)

child1.detach()
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
child3 = parent[9:]
self.assertEqual([8, 8.1, 8.2], child3)

del parent[8:]
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6], child1)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)
self.assertEqual([], child3)

del child1
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)
self.assertEqual([], child3)
self.assertEqual(2, len(parent._children))

del child3
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)
self.assertEqual(1, len(parent._children))

parent.remove(1.9)
parent.remove(1.8)
self.assertEqual([1, 4, 3, 2, 5, 6, 7, 8, 8.1, 8.2], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([1, 4, 3, 2, 5, 6], parent)
self.assertEqual([4, 3, 2], child2)

parent.reverse()
self.assertEqual([8.2, 8.1, 8, 7, 6, 5, 2, 3, 4, 1], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([6, 5, 2, 3, 4, 1], parent)
self.assertEqual([4, 3, 2], child2)
self.assertEqual(0, len(parent._children))

child2.detach()
self.assertEqual([8.2, 8.1, 8, 7, 6, 5, 2, 3, 4, 1], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([4, 3, 2], child2)

if __name__ == "__main__":
unittest.main(verbosity=2)

+ 1
- 1
tests/tokenizer/text.mwtest View File

@@ -27,6 +27,6 @@ output: [Text(text="𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰")]
---

name: large
label: a lot of text, requiring multiple textbuffer blocks in the C tokenizer
label: a lot of text, requiring proper storage in the C tokenizer
input: "ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBU
TJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN"
output: [Text(text="ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmx
gjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN")]

Loading…
Cancel
Save