Sfoglia il codice sorgente

Begin splitting up C tokenizer.

tags/v0.4.1
Ben Kurtovic 9 anni fa
parent
commit
0e547aa416
8 ha cambiato i file con 191 aggiunte e 104 eliminazioni
  1. +1
    -0
      CHANGELOG
  2. +1
    -0
      docs/changelog.rst
  3. +40
    -0
      mwparserfromhell/parser/ctokenizer/common.h
  4. +100
    -0
      mwparserfromhell/parser/ctokenizer/textbuffer.c
  5. +40
    -0
      mwparserfromhell/parser/ctokenizer/textbuffer.h
  6. +0
    -70
      mwparserfromhell/parser/ctokenizer/tokenizer.c
  7. +2
    -30
      mwparserfromhell/parser/ctokenizer/tokenizer.h
  8. +7
    -4
      setup.py

+ 1
- 0
CHANGELOG Vedi File

@@ -6,6 +6,7 @@ v0.4.1 (unreleased):
- Added support for Python 3.5.
- '<' and '>' are now disallowed in wikilink titles and template names. This
includes when denoting tags, but not comments.
- Heavy refactoring and fixes to the C tokenizer.
- Fixed some bugs in the release scripts.

v0.4 (released May 23, 2015):


+ 1
- 0
docs/changelog.rst Vedi File

@@ -13,6 +13,7 @@ Unreleased
- Added support for Python 3.5.
- ``<`` and ``>`` are now disallowed in wikilink titles and template names.
This includes when denoting tags, but not comments.
- Heavy refactoring and fixes to the C tokenizer.
- Fixed some bugs in the release scripts.

v0.4


+ 40
- 0
mwparserfromhell/parser/ctokenizer/common.h Vedi File

@@ -0,0 +1,40 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN
#endif

#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>

/* Define IS_PY3K when building against Python 3 or newer. */
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

/* Provide uint64_t as a macro over Python's long long type when no macro of
   that name exists yet.
   NOTE(review): #ifndef only sees macros, not typedefs, so this also fires
   when <stdint.h> has already declared uint64_t as a typedef -- confirm that
   is intended. */
#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
#endif

/* Every malloc()/free() written after this point in an including file is
   rewritten to PyObject_Malloc()/PyObject_Free(). */
#define malloc PyObject_Malloc
#define free PyObject_Free

+ 100
- 0
mwparserfromhell/parser/ctokenizer/textbuffer.c Vedi File

@@ -0,0 +1,100 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "textbuffer.h"

#define TEXTBUFFER_BLOCKSIZE 1024

/*
Create a new textbuffer object.
*/
Textbuffer* Textbuffer_new(void)
{
Textbuffer* buffer = malloc(sizeof(Textbuffer));

if (!buffer) {
PyErr_NoMemory();
return NULL;
}
buffer->size = 0;
buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE);
if (!buffer->data) {
free(buffer);
PyErr_NoMemory();
return NULL;
}
buffer->prev = buffer->next = NULL;
return buffer;
}

/*
    Release the given textbuffer block and every block reachable from it by
    following the `next` links. Does nothing when given a null pointer.
*/
void Textbuffer_dealloc(Textbuffer* self)
{
    Textbuffer* current = self;

    while (current) {
        /* Remember the successor before the block that holds it is freed. */
        Textbuffer* following = current->next;

        free(current->data);
        free(current);
        current = following;
    }
}

/*
    Append a single Py_UNICODE value to the textbuffer whose head block is
    *this.

    If the head block is already full, a new block is allocated, linked in
    front of it, and *this is updated to point at the new head. Returns 0 on
    success, or -1 if that allocation fails (nothing is modified in that case).
*/
int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
{
    Textbuffer* head = *this;

    if (head->size == TEXTBUFFER_BLOCKSIZE) {
        Textbuffer* fresh = Textbuffer_new();

        if (!fresh)
            return -1;
        /* The new block becomes the head; the full one hangs off `next`. */
        fresh->next = head;
        head->prev = fresh;
        *this = fresh;
        head = fresh;
    }
    head->data[head->size] = code;
    head->size++;
    return 0;
}

/*
    Return the contents of the textbuffer as a Python Unicode object.

    `self` is the head (most recently written) block. Older blocks are reached
    through `next`, so each one is prepended to the text accumulated so far.

    Returns a new reference, or NULL with a Python exception set if creating
    or concatenating any of the intermediate strings fails.
*/
PyObject* Textbuffer_render(Textbuffer* self)
{
    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
    PyObject *left, *concat;

    if (!result)
        return NULL;
    while (self->next) {
        self = self->next;
        left = PyUnicode_FromUnicode(self->data, self->size);
        if (!left) {
            /* Drop what was built so far; the exception is already set. */
            Py_DECREF(result);
            return NULL;
        }
        concat = PyUnicode_Concat(left, result);
        Py_DECREF(left);
        Py_DECREF(result);
        if (!concat)
            return NULL;
        result = concat;
    }
    return result;
}

+ 40
- 0
mwparserfromhell/parser/ctokenizer/textbuffer.h Vedi File

@@ -0,0 +1,40 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "common.h"

/* Structs */

/*
    A single block of text storage. Blocks carry links in both directions so
    that several of them can be chained together.
*/
typedef struct Textbuffer {
    Py_ssize_t size;            /* number of entries of `data` in use */
    Py_UNICODE* data;           /* storage owned by this block */
    struct Textbuffer* prev;    /* neighbouring block, or NULL */
    struct Textbuffer* next;    /* neighbouring block, or NULL */
} Textbuffer;

/* Functions */

/* Create a new, empty textbuffer block. */
Textbuffer* Textbuffer_new(void);

/* Free a textbuffer block and the blocks chained after it. */
void Textbuffer_dealloc(Textbuffer* self);

/* Append one Py_UNICODE value; the pointed-to head may be replaced. */
int Textbuffer_write(Textbuffer** buffer, Py_UNICODE code);

/* Build a Python Unicode object from the textbuffer's contents. */
PyObject* Textbuffer_render(Textbuffer* self);

mwparserfromhell/parser/tokenizer.c → mwparserfromhell/parser/ctokenizer/tokenizer.c Vedi File

@@ -1,5 +1,4 @@
/*
Tokenizer for MWParserFromHell
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -89,75 +88,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
return lowered;
}

static Textbuffer* Textbuffer_new(void)
{
Textbuffer* buffer = malloc(sizeof(Textbuffer));

if (!buffer) {
PyErr_NoMemory();
return NULL;
}
buffer->size = 0;
buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE);
if (!buffer->data) {
free(buffer);
PyErr_NoMemory();
return NULL;
}
buffer->prev = buffer->next = NULL;
return buffer;
}

/*
    Release the given textbuffer block and every block reachable from it by
    following the `next` links. Does nothing when given a null pointer.
*/
static void Textbuffer_dealloc(Textbuffer* self)
{
    Textbuffer* current = self;

    while (current) {
        /* Remember the successor before the block that holds it is freed. */
        Textbuffer* following = current->next;

        free(current->data);
        free(current);
        current = following;
    }
}

/*
    Append a single Py_UNICODE value to the textbuffer whose head block is
    *this.

    If the head block is already full, a new block is allocated, linked in
    front of it, and *this is updated to point at the new head. Returns 0 on
    success, or -1 if that allocation fails (nothing is modified in that case).
*/
static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
{
    Textbuffer* head = *this;

    if (head->size == TEXTBUFFER_BLOCKSIZE) {
        Textbuffer* fresh = Textbuffer_new();

        if (!fresh)
            return -1;
        /* The new block becomes the head; the full one hangs off `next`. */
        fresh->next = head;
        head->prev = fresh;
        *this = fresh;
        head = fresh;
    }
    head->data[head->size] = code;
    head->size++;
    return 0;
}

/*
    Return the contents of the textbuffer as a Python Unicode object.

    `self` is the head (most recently written) block. Older blocks are reached
    through `next`, so each one is prepended to the text accumulated so far.

    Returns a new reference, or NULL with a Python exception set if creating
    or concatenating any of the intermediate strings fails.
*/
static PyObject* Textbuffer_render(Textbuffer* self)
{
    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
    PyObject *left, *concat;

    if (!result)
        return NULL;
    while (self->next) {
        self = self->next;
        left = PyUnicode_FromUnicode(self->data, self->size);
        if (!left) {
            /* Drop what was built so far; the exception is already set. */
            Py_DECREF(result);
            return NULL;
        }
        concat = PyUnicode_Concat(left, result);
        Py_DECREF(left);
        Py_DECREF(result);
        if (!concat)
            return NULL;
        result = concat;
    }
    return result;
}

static TagData* TagData_new(void)
{
TagData *self = malloc(sizeof(TagData));

mwparserfromhell/parser/tokenizer.h → mwparserfromhell/parser/ctokenizer/tokenizer.h Vedi File

@@ -1,5 +1,4 @@
/*
Tokenizer Header File for MWParserFromHell
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -21,25 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN
#endif

#include <Python.h>
#include <math.h>
#include <structmember.h>
#include <bytesobject.h>

/* Define IS_PY3K when building against Python 3 or newer. */
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

/* Provide uint64_t as a macro over Python's long long type when no macro of
   that name exists yet.
   NOTE(review): #ifndef only sees macros, not typedefs -- confirm this is the
   intended check. */
#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
#endif

/* Every malloc()/free() written after this point is rewritten to
   PyObject_Malloc()/PyObject_Free(). */
#define malloc PyObject_Malloc
#define free PyObject_Free
#include "common.h"
#include "textbuffer.h"

#define DIGITS "0123456789"
#define HEXDIGITS "0123456789abcdefABCDEF"
@@ -50,7 +34,6 @@ static const char MARKERS[] = {
'-', '!', '\n', '\0'};

#define NUM_MARKERS 19
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
#define MAX_BRACES 255
@@ -196,13 +179,6 @@ static PyObject* TagCloseClose;

/* Miscellaneous structs: */

/* A single block of text storage with links to neighbouring blocks. */
struct Textbuffer {
/* Number of entries of `data` in use. */
Py_ssize_t size;
/* Storage belonging to this block. */
Py_UNICODE* data;
/* Neighbouring blocks in the chain, or NULL. */
struct Textbuffer* prev;
struct Textbuffer* next;
};

struct Stack {
PyObject* stack;
uint64_t context;
@@ -224,7 +200,6 @@ typedef struct {
Py_ssize_t reset;
} TagData;

typedef struct Textbuffer Textbuffer;
typedef struct Stack Stack;


@@ -268,9 +243,6 @@ typedef struct {

/* Function prototypes: */

/* Create a new, empty textbuffer block. */
static Textbuffer* Textbuffer_new(void);
/* Free a textbuffer block and the blocks chained after it. */
static void Textbuffer_dealloc(Textbuffer* self);

static TagData* TagData_new(void);
static void TagData_dealloc(TagData*);


+ 7
- 4
setup.py Vedi File

@@ -23,6 +23,7 @@

from __future__ import print_function
from distutils.errors import DistutilsError, CCompilerError
from glob import glob
from os import environ
import sys

@@ -39,10 +40,6 @@ from mwparserfromhell.compat import py26, py3k
with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp:
long_docs = fp.read()

# C tokenizer extension module, built from a single source file and header.
tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources=["mwparserfromhell/parser/tokenizer.c"],
depends=["mwparserfromhell/parser/tokenizer.h"])

use_extension = True
fallback = True

@@ -75,6 +72,12 @@ def build_ext_patched(self):
if fallback:
build_ext.run, build_ext_original = build_ext_patched, build_ext.run

# Project-specific part begins here:

# C tokenizer extension module, built from every source file in ctokenizer/.
# glob() returns paths in arbitrary order, so sort them to hand the compiler
# and linker the same file order on every machine.
tokenizer = Extension("mwparserfromhell.parser._tokenizer",
                      sources=sorted(glob("mwparserfromhell/parser/ctokenizer/*.c")),
                      depends=sorted(glob("mwparserfromhell/parser/ctokenizer/*.h")))

setup(
name = "mwparserfromhell",
packages = find_packages(exclude=("tests",)),


Caricamento…
Annulla
Salva