From 74ab399ed012687dd1f86d17f40a1a369e396e02 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 8 Jun 2015 18:29:22 -0400 Subject: [PATCH 01/22] Try out appveyor config (#95) --- appveyor.yml | 56 ++++++++++++++++++++++++++++++++ scripts/README | 3 ++ scripts/release.sh | 7 ++-- scripts/win_build.py | 58 --------------------------------- scripts/win_install.ps1 | 85 +++++++++++++++++++++++++++++++++++++++++++++++++ scripts/win_wrapper.cmd | 47 +++++++++++++++++++++++++++ 6 files changed, 195 insertions(+), 61 deletions(-) create mode 100644 appveyor.yml create mode 100644 scripts/README delete mode 100644 scripts/win_build.py create mode 100644 scripts/win_install.ps1 create mode 100644 scripts/win_wrapper.cmd diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..c312c89 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,56 @@ +# This config file is used by appveyor.com to build Windows release binaries + +version: 0.4.1.dev0-b{build} + +branches: + only: + - master + +skip_tags: true + +environment: + global: + # See: http://stackoverflow.com/a/13751649/163740 + WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd" + + matrix: + - PYTHON: "C:\\Python27" + PYTHON_VERSION: "2.7.10" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python27-x64" + PYTHON_VERSION: "2.7.10" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python33" + PYTHON_VERSION: "3.3.6" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python33-x64" + PYTHON_VERSION: "3.3.6" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python34" + PYTHON_VERSION: "3.4.3" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python34-x64" + PYTHON_VERSION: "3.4.3" + PYTHON_ARCH: "64" + +install: + - "powershell scripts\\win_install.ps1" + +build_script: + - "%WRAPPER% %PYTHON%\\python setup.py build" + +test_script: + - "%WRAPPER% %PYTHON%\\python setup.py -q test" + +after_test: + - "%WRAPPER% %PYTHON%\\python setup.py bdist_wheel" + +artifacts: + - path: dist\* + +deploy: off diff --git a/scripts/README b/scripts/README new file mode 100644 
index 0000000..eea9627 --- /dev/null +++ b/scripts/README @@ -0,0 +1,3 @@ +This directory contains support files used for *developing* mwparserfromhell, +not running it. If you are looking for code examples, read the documentation +or explore the source code. diff --git a/scripts/release.sh b/scripts/release.sh index dcd871c..7d79e8e 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -31,6 +31,8 @@ update_version() { echo " done." } +# TODO: update appveyor version! + update_changelog() { filename="CHANGELOG" echo -n "Updating $filename..." @@ -67,10 +69,9 @@ do_git_stuff() { } upload_to_pypi() { - # TODO: check whether these commands give output echo -n "PyPI: uploading source tarball and docs..." - python setup.py register sdist upload -s - python setup.py upload_docs + python setup.py -q register sdist upload -s + python setup.py -q upload_docs echo " done." } diff --git a/scripts/win_build.py b/scripts/win_build.py deleted file mode 100644 index 2d51909..0000000 --- a/scripts/win_build.py +++ /dev/null @@ -1,58 +0,0 @@ -# Build requirements: -# -# Python 2.6-3.2: Visual C++ Express Edition 2008: -# http://go.microsoft.com/?linkid=7729279 -# -# Python 3.3+: Visual C++ Express Edition 2010: -# http://go.microsoft.com/?linkid=9709949 -# -# x64 builds: Microsoft Windows SDK for Windows 7 and .NET Framework 3.5 SP1: -# http://www.microsoft.com/en-us/download/details.aspx?id=3138 -# -# Python interpreter, 2.6, 2.7, 3.2-3.4: -# https://www.python.org/downloads/ -# -# Pip, setuptools, wheel: -# https://bootstrap.pypa.io/get-pip.py -# and run *for each* Python version: -# c:\pythonXX\python get-pip.py -# c:\pythonXX\scripts\pip install wheel -# -# Afterwards, run this script with any of the python interpreters (2.7 suggested) - -from __future__ import print_function -import os -from subprocess import call, STDOUT - -ENVIRONMENTS = ["26", "27", "32", "33", "34"] - -def run(pyver, cmds): - cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds - print(" 
".join(cmd), end=" ") - - with open("%s%s.log" % (cmds[0], pyver), "w") as logfile: - retval = call(cmd, stdout=logfile, stderr=STDOUT, cwd="..") - if not retval: - print("[OK]") - else: - print("[FAILED (%i)]" % retval) - return retval - -def main(): - path = os.path.split(__file__)[0] - if path: - os.chdir(path) - - print("Building Windows wheels for Python %s:" % ", ".join(ENVIRONMENTS)) - for pyver in ENVIRONMENTS: - print() - try: - os.unlink("mwparserfromhell/parser/_tokenizer.pyd") - except OSError: - pass - - if run(pyver, ["test"]) == 0: - run(pyver, ["bdist_wheel", "upload"]) # TODO: add "-s" to GPG sign - -if __name__ == "__main__": - main() diff --git a/scripts/win_install.ps1 b/scripts/win_install.ps1 new file mode 100644 index 0000000..bd76d11 --- /dev/null +++ b/scripts/win_install.ps1 @@ -0,0 +1,85 @@ +# Sample script to install Python and pip under Windows +# Authors: Olivier Grisel and Kyle Kastner +# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ + +$BASE_URL = "https://www.python.org/ftp/python/" +$GET_PIP_URL = "https://bootstrap.pypa.io/get-pip.py" +$GET_PIP_PATH = "C:\get-pip.py" + + +function DownloadPython ($python_version, $platform_suffix) { + $webclient = New-Object System.Net.WebClient + $filename = "python-" + $python_version + $platform_suffix + ".msi" + $url = $BASE_URL + $python_version + "/" + $filename + + $basedir = $pwd.Path + "\" + $filepath = $basedir + $filename + if (Test-Path $filename) { + Write-Host "Reusing" $filepath + return $filepath + } + + # Download and retry up to 3 times in case of network transient errors. 
+ Write-Host "Downloading" $filename "from" $url + $retry_attempts = 3 + for($i=0; $i -lt $retry_attempts; $i++){ + try { + $webclient.DownloadFile($url, $filepath) + break + } + Catch [Exception]{ + Start-Sleep 1 + } + } + Write-Host "File saved at" $filepath + return $filepath +} + + +function InstallPython ($python_version, $architecture, $python_home) { + Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home + if (Test-Path $python_home) { + Write-Host $python_home "already exists, skipping." + return $false + } + if ($architecture -eq "32") { + $platform_suffix = "" + } else { + $platform_suffix = ".amd64" + } + $filepath = DownloadPython $python_version $platform_suffix + Write-Host "Installing" $filepath "to" $python_home + $args = "/qn /i $filepath TARGETDIR=$python_home" + Write-Host "msiexec.exe" $args + Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru + Write-Host "Python $python_version ($architecture) installation complete" + return $true +} + + +function InstallPip ($python_home) { + $pip_path = $python_home + "/Scripts/pip.exe" + $python_path = $python_home + "/python.exe" + if (-not(Test-Path $pip_path)) { + Write-Host "Installing pip..." + $webclient = New-Object System.Net.WebClient + $webclient.DownloadFile($GET_PIP_URL, $GET_PIP_PATH) + Write-Host "Executing:" $python_path $GET_PIP_PATH + Start-Process -FilePath "$python_path" -ArgumentList "$GET_PIP_PATH" -Wait -Passthru + } else { + Write-Host "pip already installed." 
+ } +} + +function InstallPackage ($python_home, $pkg) { + $pip_path = $python_home + "/Scripts/pip.exe" + & $pip_path install $pkg +} + +function main () { + InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON + InstallPip $env:PYTHON + InstallPackage $env:PYTHON wheel +} + +main diff --git a/scripts/win_wrapper.cmd b/scripts/win_wrapper.cmd new file mode 100644 index 0000000..3a472bc --- /dev/null +++ b/scripts/win_wrapper.cmd @@ -0,0 +1,47 @@ +:: To build extensions for 64 bit Python 3, we need to configure environment +:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) +:: +:: To build extensions for 64 bit Python 2, we need to configure environment +:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) +:: +:: 32 bit builds do not require specific environment configurations. +:: +:: Note: this script needs to be run with the /E:ON and /V:ON flags for the +:: cmd interpreter, at least for (SDK v7.0) +:: +:: More details at: +:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows +:: http://stackoverflow.com/a/13751649/163740 +:: +:: Author: Olivier Grisel +:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ +@ECHO OFF + +SET COMMAND_TO_RUN=%* +SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows + +SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" +IF %MAJOR_PYTHON_VERSION% == "2" ( + SET WINDOWS_SDK_VERSION="v7.0" +) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( + SET WINDOWS_SDK_VERSION="v7.1" +) ELSE ( + ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" + EXIT 1 +) + +IF "%PYTHON_ARCH%"=="64" ( + ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture + SET DISTUTILS_USE_SDK=1 + SET MSSdk=1 + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q 
-version:%WINDOWS_SDK_VERSION% + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) ELSE ( + ECHO Using default MSVC build environment for 32 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) From c7497ddf63bd5160a47aa407cbad7cc5a565cd2d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 10 Jun 2015 16:25:53 -0400 Subject: [PATCH 02/22] Updates to installation/release code. * Remove unnecessary win_install.ps1. * Add appveyor version update to release script. * Make setup.py's pure Python fallback code less intrusive. --- appveyor.yml | 10 +++--- scripts/release.sh | 8 ++++- scripts/win_install.ps1 | 85 ---------------------------------------------- setup.py | 90 ++++++++++++++++++------------------------------- 4 files changed, 46 insertions(+), 147 deletions(-) delete mode 100644 scripts/win_install.ps1 diff --git a/appveyor.yml b/appveyor.yml index c312c89..34201f2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -12,6 +12,8 @@ environment: global: # See: http://stackoverflow.com/a/13751649/163740 WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd" + PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe" + SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension" matrix: - PYTHON: "C:\\Python27" @@ -39,16 +41,16 @@ environment: PYTHON_ARCH: "64" install: - - "powershell scripts\\win_install.ps1" + - "%PIP% install wheel" build_script: - - "%WRAPPER% %PYTHON%\\python setup.py build" + - "%SETUPPY% build" test_script: - - "%WRAPPER% %PYTHON%\\python setup.py -q test" + - "%SETUPPY% -q test" after_test: - - "%WRAPPER% %PYTHON%\\python setup.py bdist_wheel" + - "%SETUPPY% bdist_wheel" artifacts: - path: dist\* diff --git a/scripts/release.sh b/scripts/release.sh index 7d79e8e..c256c7c 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -31,7 +31,12 @@ update_version() { echo " done." } -# TODO: update appveyor version! 
+update_appveyor() { + filename="appveyor.yml" + echo -n "Updating $filename..." + sed -e "s/version: .*/version: $VERSION-b{build}/" -i "" $filename + echo " done." +} update_changelog() { filename="CHANGELOG" @@ -154,6 +159,7 @@ cd "$SCRIPT_DIR/.." check_git update_version +update_appveyor update_changelog update_docs_changelog do_git_stuff diff --git a/scripts/win_install.ps1 b/scripts/win_install.ps1 deleted file mode 100644 index bd76d11..0000000 --- a/scripts/win_install.ps1 +++ /dev/null @@ -1,85 +0,0 @@ -# Sample script to install Python and pip under Windows -# Authors: Olivier Grisel and Kyle Kastner -# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ - -$BASE_URL = "https://www.python.org/ftp/python/" -$GET_PIP_URL = "https://bootstrap.pypa.io/get-pip.py" -$GET_PIP_PATH = "C:\get-pip.py" - - -function DownloadPython ($python_version, $platform_suffix) { - $webclient = New-Object System.Net.WebClient - $filename = "python-" + $python_version + $platform_suffix + ".msi" - $url = $BASE_URL + $python_version + "/" + $filename - - $basedir = $pwd.Path + "\" - $filepath = $basedir + $filename - if (Test-Path $filename) { - Write-Host "Reusing" $filepath - return $filepath - } - - # Download and retry up to 3 times in case of network transient errors. - Write-Host "Downloading" $filename "from" $url - $retry_attempts = 3 - for($i=0; $i -lt $retry_attempts; $i++){ - try { - $webclient.DownloadFile($url, $filepath) - break - } - Catch [Exception]{ - Start-Sleep 1 - } - } - Write-Host "File saved at" $filepath - return $filepath -} - - -function InstallPython ($python_version, $architecture, $python_home) { - Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home - if (Test-Path $python_home) { - Write-Host $python_home "already exists, skipping." 
- return $false - } - if ($architecture -eq "32") { - $platform_suffix = "" - } else { - $platform_suffix = ".amd64" - } - $filepath = DownloadPython $python_version $platform_suffix - Write-Host "Installing" $filepath "to" $python_home - $args = "/qn /i $filepath TARGETDIR=$python_home" - Write-Host "msiexec.exe" $args - Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru - Write-Host "Python $python_version ($architecture) installation complete" - return $true -} - - -function InstallPip ($python_home) { - $pip_path = $python_home + "/Scripts/pip.exe" - $python_path = $python_home + "/python.exe" - if (-not(Test-Path $pip_path)) { - Write-Host "Installing pip..." - $webclient = New-Object System.Net.WebClient - $webclient.DownloadFile($GET_PIP_URL, $GET_PIP_PATH) - Write-Host "Executing:" $python_path $GET_PIP_PATH - Start-Process -FilePath "$python_path" -ArgumentList "$GET_PIP_PATH" -Wait -Passthru - } else { - Write-Host "pip already installed." - } -} - -function InstallPackage ($python_home, $pkg) { - $pip_path = $python_home + "/Scripts/pip.exe" - & $pip_path install $pkg -} - -function main () { - InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON - InstallPip $env:PYTHON - InstallPackage $env:PYTHON wheel -} - -main diff --git a/setup.py b/setup.py index e2744ef..dcdd563 100644 --- a/setup.py +++ b/setup.py @@ -21,17 +21,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-import os +from __future__ import print_function +from distutils.errors import DistutilsError, CCompilerError +from os import environ import sys -if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \ - (sys.version_info[1] == 3 and sys.version_info[1] < 2): - raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+") - -if sys.version_info >= (3, 0): - basestring = (str, ) +if ((sys.version_info[0] == 2 and sys.version_info[1] < 6) or + (sys.version_info[1] == 3 and sys.version_info[1] < 2)): + raise RuntimeError("mwparserfromhell needs Python 2.6+ or 3.2+") from setuptools import setup, find_packages, Extension +from setuptools.command.build_ext import build_ext from mwparserfromhell import __version__ from mwparserfromhell.compat import py26, py3k @@ -44,65 +44,41 @@ tokenizer = Extension("mwparserfromhell.parser._tokenizer", depends=["mwparserfromhell/parser/tokenizer.h"]) use_extension = True +fallback = True -# Allow env var WITHOUT_EXTENSION and args --with[out]-extension -if '--without-extension' in sys.argv: - use_extension = False -elif '--with-extension' in sys.argv: - pass -elif os.environ.get('WITHOUT_EXTENSION', '0') == '1': - use_extension = False - -# Remove the command line argument as it isn't understood by -# setuptools/distutils -sys.argv = [arg for arg in sys.argv - if not arg.startswith('--with') - and not arg.endswith('-extension')] - - -def optional_compile_setup(func=setup, use_ext=use_extension, - *args, **kwargs): - """ - Wrap setup to allow optional compilation of extensions. - - Falls back to pure python mode (no extensions) - if compilation of extensions fails. 
- """ - extensions = kwargs.get('ext_modules', None) +# Allow env var WITHOUT_EXTENSION and args --with[out]-extension: - if use_ext and extensions: - try: - func(*args, **kwargs) - return - except SystemExit as e: - assert(e.args) - if e.args[0] is False: - raise - elif isinstance(e.args[0], basestring): - if e.args[0].startswith('usage: '): - raise - else: - # Fallback to pure python mode - print('setup with extension failed: %s' % repr(e)) - pass - except Exception as e: - print('setup with extension failed: %s' % repr(e)) +env_var = environ.get("WITHOUT_EXTENSION") +if "--without-extension" in sys.argv: + use_extension = False +elif "--with-extension" in sys.argv: + fallback = False +elif env_var is not None: + if env_var == "1": + use_extension = False + elif env_var == "0": + fallback = False - if extensions: - if use_ext: - print('Falling back to pure python mode.') - else: - print('Using pure python mode.') +# Remove the command line argument as it isn't understood by setuptools: - del kwargs['ext_modules'] +sys.argv = [arg for arg in sys.argv + if arg != "--without-extension" and arg != "--with-extension"] - func(*args, **kwargs) +def build_ext_patched(self): + try: + build_ext_original(self) + except (DistutilsError, CCompilerError) as exc: + print("error: " + str(exc)) + print("Falling back to pure Python mode.") + del self.extensions[:] +if fallback: + build_ext.run, build_ext_original = build_ext_patched, build_ext.run -optional_compile_setup( +setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)), - ext_modules = [tokenizer], + ext_modules = [tokenizer] if use_extension else [], tests_require = ["unittest2"] if py26 else [], test_suite = "tests.discover", version = __version__, From dad042bc2c637c05728b29881eca644745f1fc6b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 10 Jun 2015 22:42:57 -0400 Subject: [PATCH 03/22] Fix C warnings in MSVC. 
--- mwparserfromhell/parser/tokenizer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ec0315f..dd11d16 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -40,7 +40,7 @@ static int is_marker(Py_UNICODE this) /* Given a context, return the heading level encoded within it. */ -static int heading_level_from_context(int n) +static int heading_level_from_context(uint64_t n) { int level; @@ -177,7 +177,8 @@ static TagData* TagData_new(void) ALLOC_BUFFER(self->pad_first) ALLOC_BUFFER(self->pad_before_eq) ALLOC_BUFFER(self->pad_after_eq) - self->quoter = self->reset = 0; + self->quoter = 0; + self->reset = 0; return self; } @@ -444,7 +445,7 @@ static int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) { Textbuffer *original = buffer; - long i; + Py_ssize_t i; if (reverse) { do { @@ -939,7 +940,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; PyObject *scheme; Py_UNICODE chunk; - long i; + Py_ssize_t i; int slashes, j; if (!scheme_buffer) From a8c0ff3f290cf82da8e22cd0007d78adfed3c4b1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 11 Jun 2015 00:03:49 -0400 Subject: [PATCH 04/22] Remove stdint.h include for MSVC 2008. --- mwparserfromhell/parser/tokenizer.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 842e65d..102fecd 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -29,12 +29,15 @@ SOFTWARE. 
#include #include #include -#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K #endif +#ifndef uint64_t +#define uint64_t unsigned PY_LONG_LONG +#endif + #define malloc PyObject_Malloc #define free PyObject_Free From 3aa6bb891cdd59e7f0e4483bc3316a5612ab4989 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 11 Jun 2015 18:11:54 -0400 Subject: [PATCH 05/22] Point releases are unnecessary in appyveyor.yml. --- appveyor.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 34201f2..9d0d8c8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,27 +17,27 @@ environment: matrix: - PYTHON: "C:\\Python27" - PYTHON_VERSION: "2.7.10" + PYTHON_VERSION: "2.7" PYTHON_ARCH: "32" - PYTHON: "C:\\Python27-x64" - PYTHON_VERSION: "2.7.10" + PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" - PYTHON: "C:\\Python33" - PYTHON_VERSION: "3.3.6" + PYTHON_VERSION: "3.3" PYTHON_ARCH: "32" - PYTHON: "C:\\Python33-x64" - PYTHON_VERSION: "3.3.6" + PYTHON_VERSION: "3.3" PYTHON_ARCH: "64" - PYTHON: "C:\\Python34" - PYTHON_VERSION: "3.4.3" + PYTHON_VERSION: "3.4" PYTHON_ARCH: "32" - PYTHON: "C:\\Python34-x64" - PYTHON_VERSION: "3.4.3" + PYTHON_VERSION: "3.4" PYTHON_ARCH: "64" install: From efc571c5c0e18782f2514b39b9bf351c19fafce4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 11 Jun 2015 21:45:34 -0400 Subject: [PATCH 06/22] Refactor _test_tokenizer; add syntax for running just one test. --- tests/_test_tokenizer.py | 67 +++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index 1cbbc3d..cacf166 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -42,8 +42,8 @@ class TokenizerTestCase(object): directory. """ - @classmethod - def _build_test_method(cls, funcname, data): + @staticmethod + def _build_test_method(funcname, data): """Create and return a method to be treated as a test case method. 
*data* is a dict containing multiple keys: the *input* text to be @@ -58,13 +58,35 @@ class TokenizerTestCase(object): expected = data["output"] actual = self.tokenizer().tokenize(data["input"]) self.assertEqual(expected, actual) + if not py3k: inner.__name__ = funcname.encode("utf8") inner.__doc__ = data["label"] return inner + @staticmethod + def _parse_test(test, data): + """Parse an individual *test*, storing its info in *data*.""" + for line in test.strip().splitlines(): + if line.startswith("name:"): + data["name"] = line[len("name:"):].strip() + elif line.startswith("label:"): + data["label"] = line[len("label:"):].strip() + elif line.startswith("input:"): + raw = line[len("input:"):].strip() + if raw[0] == '"' and raw[-1] == '"': + raw = raw[1:-1] + raw = raw.encode("raw_unicode_escape") + data["input"] = raw.decode("unicode_escape") + elif line.startswith("output:"): + raw = line[len("output:"):].strip() + try: + data["output"] = eval(raw, vars(tokens)) + except Exception as err: + raise _TestParseError(err) + @classmethod - def _load_tests(cls, filename, name, text): + def _load_tests(cls, filename, name, text, restrict=None): """Load all tests in *text* from the file *filename*.""" tests = text.split("\n---\n") counter = 1 @@ -72,23 +94,7 @@ class TokenizerTestCase(object): for test in tests: data = {"name": None, "label": None, "input": None, "output": None} try: - for line in test.strip().splitlines(): - if line.startswith("name:"): - data["name"] = line[len("name:"):].strip() - elif line.startswith("label:"): - data["label"] = line[len("label:"):].strip() - elif line.startswith("input:"): - raw = line[len("input:"):].strip() - if raw[0] == '"' and raw[-1] == '"': - raw = raw[1:-1] - raw = raw.encode("raw_unicode_escape") - data["input"] = raw.decode("unicode_escape") - elif line.startswith("output:"): - raw = line[len("output:"):].strip() - try: - data["output"] = eval(raw, vars(tokens)) - except Exception as err: - raise _TestParseError(err) + 
cls._parse_test(test, data) except _TestParseError as err: if data["name"]: error = "Could not parse test '{0}' in '{1}':\n\t{2}" @@ -97,6 +103,7 @@ class TokenizerTestCase(object): error = "Could not parse a test in '{0}':\n\t{1}" print(error.format(filename, err)) continue + if not data["name"]: error = "A test in '{0}' was ignored because it lacked a name" print(error.format(filename)) @@ -105,27 +112,35 @@ class TokenizerTestCase(object): error = "Test '{0}' in '{1}' was ignored because it lacked an input or an output" print(error.format(data["name"], filename)) continue + number = str(counter).zfill(digits) + counter += 1 + if restrict and data["name"] != restrict: + continue + fname = "test_{0}{1}_{2}".format(name, number, data["name"]) meth = cls._build_test_method(fname, data) setattr(cls, fname, meth) - counter += 1 @classmethod def build(cls): """Load and install all tests from the 'tokenizer' directory.""" - def load_file(filename): + def load_file(filename, restrict=None): with codecs.open(filename, "rU", encoding="utf8") as fp: text = fp.read() - name = path.split(filename)[1][:0-len(extension)] - cls._load_tests(filename, name, text) + name = path.split(filename)[1][:-len(extension)] + cls._load_tests(filename, name, text, restrict) directory = path.join(path.dirname(__file__), "tokenizer") extension = ".mwtest" if len(sys.argv) > 2 and sys.argv[1] == "--use": for name in sys.argv[2:]: - load_file(path.join(directory, name + extension)) - sys.argv = [sys.argv[0]] # So unittest doesn't try to load these + if "." in name: + name, test = name.split(".", 1) + else: + test = None + load_file(path.join(directory, name + extension), test) + sys.argv = [sys.argv[0]] # So unittest doesn't try to parse this cls.skip_others = True else: for filename in listdir(directory): From 0e547aa416f76970fc09092f110e3367bced99fd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 14 Jun 2015 17:40:10 -0400 Subject: [PATCH 07/22] Begin splitting up C tokenizer. 
--- CHANGELOG | 1 + docs/changelog.rst | 1 + mwparserfromhell/parser/ctokenizer/common.h | 40 +++++++++ mwparserfromhell/parser/ctokenizer/textbuffer.c | 100 +++++++++++++++++++++ mwparserfromhell/parser/ctokenizer/textbuffer.h | 40 +++++++++ .../parser/{ => ctokenizer}/tokenizer.c | 70 --------------- .../parser/{ => ctokenizer}/tokenizer.h | 32 +------ setup.py | 11 ++- 8 files changed, 191 insertions(+), 104 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/common.h create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.c create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.h rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.c (98%) rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.h (95%) diff --git a/CHANGELOG b/CHANGELOG index c49aaf7..7ad2930 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,7 @@ v0.4.1 (unreleased): - Added support for Python 3.5. - '<' and '>' are now disallowed in wikilink titles and template names. This includes when denoting tags, but not comments. +- Heavy refactoring and fixes to the C tokenizer. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 3217a35..2944992 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -13,6 +13,7 @@ Unreleased - Added support for Python 3.5. - ``<`` and ``>`` are now disallowed in wikilink titles and template names. This includes when denoting tags, but not comments. +- Heavy refactoring and fixes to the C tokenizer. - Fixed some bugs in the release scripts. 
v0.4 diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h new file mode 100644 index 0000000..2ed5a02 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -0,0 +1,40 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif + +#include +#include +#include + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +#ifndef uint64_t +#define uint64_t unsigned PY_LONG_LONG +#endif + +#define malloc PyObject_Malloc +#define free PyObject_Free diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c new file mode 100644 index 0000000..63d45d6 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c @@ -0,0 +1,100 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "textbuffer.h" + +#define TEXTBUFFER_BLOCKSIZE 1024 + +/* + Create a new textbuffer object. 
+*/ +Textbuffer* Textbuffer_new(void) +{ + Textbuffer* buffer = malloc(sizeof(Textbuffer)); + + if (!buffer) { + PyErr_NoMemory(); + return NULL; + } + buffer->size = 0; + buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); + if (!buffer->data) { + free(buffer); + PyErr_NoMemory(); + return NULL; + } + buffer->prev = buffer->next = NULL; + return buffer; +} + +/* + Deallocate the given textbuffer. +*/ +void Textbuffer_dealloc(Textbuffer* self) +{ + Textbuffer* next; + + while (self) { + free(self->data); + next = self->next; + free(self); + self = next; + } +} + +/* + Write a Unicode codepoint to the given textbuffer. +*/ +int Textbuffer_write(Textbuffer** this, Py_UNICODE code) +{ + Textbuffer* self = *this; + + if (self->size == TEXTBUFFER_BLOCKSIZE) { + Textbuffer* new = Textbuffer_new(); + if (!new) + return -1; + new->next = self; + self->prev = new; + *this = self = new; + } + self->data[self->size++] = code; + return 0; +} + +/* + Return the contents of the textbuffer as a Python Unicode object. 
+*/ +PyObject* Textbuffer_render(Textbuffer* self) +{ + PyObject *result = PyUnicode_FromUnicode(self->data, self->size); + PyObject *left, *concat; + + while (self->next) { + self = self->next; + left = PyUnicode_FromUnicode(self->data, self->size); + concat = PyUnicode_Concat(left, result); + Py_DECREF(left); + Py_DECREF(result); + result = concat; + } + return result; +} diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h new file mode 100644 index 0000000..36b2207 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -0,0 +1,40 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "common.h" + +/* Structs */ + +struct Textbuffer { + Py_ssize_t size; + Py_UNICODE* data; + struct Textbuffer* prev; + struct Textbuffer* next; +}; +typedef struct Textbuffer Textbuffer; + +/* Functions */ + +Textbuffer* Textbuffer_new(void); +void Textbuffer_dealloc(Textbuffer*); +int Textbuffer_write(Textbuffer**, Py_UNICODE); +PyObject* Textbuffer_render(Textbuffer*); diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c similarity index 98% rename from mwparserfromhell/parser/tokenizer.c rename to mwparserfromhell/parser/ctokenizer/tokenizer.c index dd11d16..2bce247 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -1,5 +1,4 @@ /* -Tokenizer for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -89,75 +88,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) return lowered; } -static Textbuffer* Textbuffer_new(void) -{ - Textbuffer* buffer = malloc(sizeof(Textbuffer)); - - if (!buffer) { - PyErr_NoMemory(); - return NULL; - } - buffer->size = 0; - buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); - if (!buffer->data) { - free(buffer); - PyErr_NoMemory(); - return NULL; - } - buffer->prev = buffer->next = NULL; - return buffer; -} - -static void Textbuffer_dealloc(Textbuffer* self) -{ - Textbuffer* next; - - while (self) { - free(self->data); - next = self->next; - free(self); - self = next; - } -} - -/* - Write a Unicode codepoint to the given textbuffer. 
-*/ -static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) -{ - Textbuffer* self = *this; - - if (self->size == TEXTBUFFER_BLOCKSIZE) { - Textbuffer* new = Textbuffer_new(); - if (!new) - return -1; - new->next = self; - self->prev = new; - *this = self = new; - } - self->data[self->size++] = code; - return 0; -} - -/* - Return the contents of the textbuffer as a Python Unicode object. -*/ -static PyObject* Textbuffer_render(Textbuffer* self) -{ - PyObject *result = PyUnicode_FromUnicode(self->data, self->size); - PyObject *left, *concat; - - while (self->next) { - self = self->next; - left = PyUnicode_FromUnicode(self->data, self->size); - concat = PyUnicode_Concat(left, result); - Py_DECREF(left); - Py_DECREF(result); - result = concat; - } - return result; -} - static TagData* TagData_new(void) { TagData *self = malloc(sizeof(TagData)); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/ctokenizer/tokenizer.h similarity index 95% rename from mwparserfromhell/parser/tokenizer.h rename to mwparserfromhell/parser/ctokenizer/tokenizer.h index 102fecd..66f1e90 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.h @@ -1,5 +1,4 @@ /* -Tokenizer Header File for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -21,25 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef PY_SSIZE_T_CLEAN -#define PY_SSIZE_T_CLEAN -#endif - -#include #include -#include -#include -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -#ifndef uint64_t -#define uint64_t unsigned PY_LONG_LONG -#endif - -#define malloc PyObject_Malloc -#define free PyObject_Free +#include "common.h" +#include "textbuffer.h" #define DIGITS "0123456789" #define HEXDIGITS "0123456789abcdefABCDEF" @@ -50,7 +34,6 @@ static const char MARKERS[] = { '-', '!', '\n', '\0'}; #define NUM_MARKERS 19 -#define TEXTBUFFER_BLOCKSIZE 1024 #define MAX_DEPTH 40 #define MAX_CYCLES 100000 #define MAX_BRACES 255 @@ -196,13 +179,6 @@ static PyObject* TagCloseClose; /* Miscellaneous structs: */ -struct Textbuffer { - Py_ssize_t size; - Py_UNICODE* data; - struct Textbuffer* prev; - struct Textbuffer* next; -}; - struct Stack { PyObject* stack; uint64_t context; @@ -224,7 +200,6 @@ typedef struct { Py_ssize_t reset; } TagData; -typedef struct Textbuffer Textbuffer; typedef struct Stack Stack; @@ -268,9 +243,6 @@ typedef struct { /* Function prototypes: */ -static Textbuffer* Textbuffer_new(void); -static void Textbuffer_dealloc(Textbuffer*); - static TagData* TagData_new(void); static void TagData_dealloc(TagData*); diff --git a/setup.py b/setup.py index dcdd563..1bca436 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ from __future__ import print_function from distutils.errors import DistutilsError, CCompilerError +from glob import glob from os import environ import sys @@ -39,10 +40,6 @@ from mwparserfromhell.compat import py26, py3k with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp: long_docs = fp.read() -tokenizer = Extension("mwparserfromhell.parser._tokenizer", - sources=["mwparserfromhell/parser/tokenizer.c"], - depends=["mwparserfromhell/parser/tokenizer.h"]) - use_extension = True fallback = True @@ -75,6 +72,12 @@ def build_ext_patched(self): if fallback: build_ext.run, build_ext_original = build_ext_patched, build_ext.run +# Project-specific 
part begins here: + +tokenizer = Extension("mwparserfromhell.parser._tokenizer", + sources=glob("mwparserfromhell/parser/ctokenizer/*.c"), + depends=glob("mwparserfromhell/parser/ctokenizer/*.h")) + setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)), From 2005efd309ecb21e658a7fa7c0739efed54c27ec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 15 Jun 2015 00:05:28 -0400 Subject: [PATCH 08/22] Split up C tokenizer into tag_data, tok_parse, tok_support, tokens. --- mwparserfromhell/parser/ctokenizer/common.h | 56 +- mwparserfromhell/parser/ctokenizer/contexts.h | 104 + mwparserfromhell/parser/ctokenizer/tag_data.c | 88 + mwparserfromhell/parser/ctokenizer/tag_data.h | 43 + mwparserfromhell/parser/ctokenizer/textbuffer.h | 12 +- mwparserfromhell/parser/ctokenizer/tok_parse.c | 2750 +++++++++++++++++++ mwparserfromhell/parser/ctokenizer/tok_parse.h | 29 + mwparserfromhell/parser/ctokenizer/tok_support.c | 362 +++ mwparserfromhell/parser/ctokenizer/tok_support.h | 66 + mwparserfromhell/parser/ctokenizer/tokenizer.c | 3162 +--------------------- mwparserfromhell/parser/ctokenizer/tokenizer.h | 238 +- mwparserfromhell/parser/ctokenizer/tokens.c | 111 + mwparserfromhell/parser/ctokenizer/tokens.h | 69 + 13 files changed, 3722 insertions(+), 3368 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/contexts.h create mode 100644 mwparserfromhell/parser/ctokenizer/tag_data.c create mode 100644 mwparserfromhell/parser/ctokenizer/tag_data.h create mode 100644 mwparserfromhell/parser/ctokenizer/tok_parse.c create mode 100644 mwparserfromhell/parser/ctokenizer/tok_parse.h create mode 100644 mwparserfromhell/parser/ctokenizer/tok_support.c create mode 100644 mwparserfromhell/parser/ctokenizer/tok_support.h create mode 100644 mwparserfromhell/parser/ctokenizer/tokens.c create mode 100644 mwparserfromhell/parser/ctokenizer/tokens.h diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h 
index 2ed5a02..58c9487 100644 --- a/mwparserfromhell/parser/ctokenizer/common.h +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -20,14 +20,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#pragma once + #ifndef PY_SSIZE_T_CLEAN -#define PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html #endif #include #include #include +/* Compatibility macros */ + #if PY_MAJOR_VERSION >= 3 #define IS_PY3K #endif @@ -36,5 +40,53 @@ SOFTWARE. #define uint64_t unsigned PY_LONG_LONG #endif -#define malloc PyObject_Malloc +#define malloc PyObject_Malloc // XXX: yuck #define free PyObject_Free + +/* Error handling globals/macros */ + +extern int route_state; // TODO: this is NOT thread-safe! +extern uint64_t route_context; + +#define BAD_ROUTE route_state +#define BAD_ROUTE_CONTEXT route_context +#define FAIL_ROUTE(context) { route_state = 1; route_context = context; } +#define RESET_ROUTE() route_state = 0 + +/* Shared globals */ + +extern char** entitydefs; + +extern PyObject* EMPTY; +extern PyObject* NOARGS; +extern PyObject* definitions; + +/* Structs */ + +struct Textbuffer { + Py_ssize_t size; + Py_UNICODE* data; + struct Textbuffer* prev; + struct Textbuffer* next; +}; +typedef struct Textbuffer Textbuffer; + +struct Stack { + PyObject* stack; + uint64_t context; + struct Textbuffer* textbuffer; + struct Stack* next; +}; +typedef struct Stack Stack; + +typedef struct { + PyObject_HEAD + PyObject* text; /* text to tokenize */ + Stack* topstack; /* topmost stack */ + Py_ssize_t head; /* current position in text */ + Py_ssize_t length; /* length of text */ + int global; /* global context */ + int depth; /* stack recursion depth */ + int cycles; /* total number of stack recursions */ + int skip_style_tags; /* temporary fix for the sometimes broken tag parser */ +} Tokenizer; diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h new file 
mode 100644 index 0000000..8e24372 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/contexts.h @@ -0,0 +1,104 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#pragma once + +/* Local contexts */ + +#define LC_TEMPLATE 0x0000000000000007 +#define LC_TEMPLATE_NAME 0x0000000000000001 +#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 +#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 + +#define LC_ARGUMENT 0x0000000000000018 +#define LC_ARGUMENT_NAME 0x0000000000000008 +#define LC_ARGUMENT_DEFAULT 0x0000000000000010 + +#define LC_WIKILINK 0x0000000000000060 +#define LC_WIKILINK_TITLE 0x0000000000000020 +#define LC_WIKILINK_TEXT 0x0000000000000040 + +#define LC_EXT_LINK 0x0000000000000180 +#define LC_EXT_LINK_URI 0x0000000000000080 +#define LC_EXT_LINK_TITLE 0x0000000000000100 + +#define LC_HEADING 0x0000000000007E00 +#define LC_HEADING_LEVEL_1 0x0000000000000200 +#define LC_HEADING_LEVEL_2 0x0000000000000400 +#define LC_HEADING_LEVEL_3 0x0000000000000800 +#define LC_HEADING_LEVEL_4 0x0000000000001000 +#define LC_HEADING_LEVEL_5 0x0000000000002000 +#define LC_HEADING_LEVEL_6 0x0000000000004000 + +#define LC_TAG 0x0000000000078000 +#define LC_TAG_OPEN 0x0000000000008000 +#define LC_TAG_ATTR 0x0000000000010000 +#define LC_TAG_BODY 0x0000000000020000 +#define LC_TAG_CLOSE 0x0000000000040000 + +#define LC_STYLE 0x0000000000780000 +#define LC_STYLE_ITALICS 0x0000000000080000 +#define LC_STYLE_BOLD 0x0000000000100000 +#define LC_STYLE_PASS_AGAIN 0x0000000000200000 +#define LC_STYLE_SECOND_PASS 0x0000000000400000 + +#define LC_DLTERM 0x0000000000800000 + +#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_HAS_TEXT 0x0000000001000000 +#define LC_FAIL_ON_TEXT 0x0000000002000000 +#define LC_FAIL_NEXT 0x0000000004000000 +#define LC_FAIL_ON_LBRACE 0x0000000008000000 +#define LC_FAIL_ON_RBRACE 0x0000000010000000 +#define LC_FAIL_ON_EQUALS 0x0000000020000000 + +#define LC_TABLE 0x0000000FC0000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 +#define LC_TABLE_OPEN 0x0000000040000000 +#define LC_TABLE_CELL_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_STYLE 0x0000000100000000 +#define LC_TABLE_ROW_OPEN 
0x0000000200000000 +#define LC_TABLE_TD_LINE 0x0000000400000000 +#define LC_TABLE_TH_LINE 0x0000000800000000 + +/* Global contexts */ + +#define GL_HEADING 0x1 + +/* Aggregate contexts */ + +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) +#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) +#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) + +/* Tag contexts */ + +#define TAG_NAME 0x01 +#define TAG_ATTR_READY 0x02 +#define TAG_ATTR_NAME 0x04 +#define TAG_ATTR_VALUE 0x08 +#define TAG_QUOTED 0x10 +#define TAG_NOTE_SPACE 0x20 +#define TAG_NOTE_EQUALS 0x40 +#define TAG_NOTE_QUOTE 0x80 diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.c b/mwparserfromhell/parser/ctokenizer/tag_data.c new file mode 100644 index 0000000..968a760 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.c @@ -0,0 +1,88 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tag_data.h" +#include "contexts.h" + +/* + Initialize a new TagData object. +*/ +TagData* TagData_new(void) +{ +#define ALLOC_BUFFER(name) \ + name = Textbuffer_new(); \ + if (!name) { \ + TagData_dealloc(self); \ + return NULL; \ + } + + TagData *self = malloc(sizeof(TagData)); + if (!self) { + PyErr_NoMemory(); + return NULL; + } + self->context = TAG_NAME; + ALLOC_BUFFER(self->pad_first) + ALLOC_BUFFER(self->pad_before_eq) + ALLOC_BUFFER(self->pad_after_eq) + self->quoter = 0; + self->reset = 0; + return self; + +#undef ALLOC_BUFFER +} + +/* + Deallocate the given TagData object. +*/ +void TagData_dealloc(TagData* self) +{ +#define DEALLOC_BUFFER(name) \ + if (name) \ + Textbuffer_dealloc(name); + + DEALLOC_BUFFER(self->pad_first); + DEALLOC_BUFFER(self->pad_before_eq); + DEALLOC_BUFFER(self->pad_after_eq); + free(self); + +#undef DEALLOC_BUFFER +} + +/* + Clear the internal buffers of the given TagData object. 
+*/ +int TagData_reset_buffers(TagData* self) +{ +#define RESET_BUFFER(name) \ + Textbuffer_dealloc(name); \ + name = Textbuffer_new(); \ + if (!name) \ + return -1; + + RESET_BUFFER(self->pad_first) + RESET_BUFFER(self->pad_before_eq) + RESET_BUFFER(self->pad_after_eq) + return 0; + +#undef RESET_BUFFER +} diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.h b/mwparserfromhell/parser/ctokenizer/tag_data.h new file mode 100644 index 0000000..e2ae807 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.h @@ -0,0 +1,43 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#pragma once + +#include "common.h" +#include "textbuffer.h" + +/* Structs */ + +typedef struct { + uint64_t context; + Textbuffer* pad_first; + Textbuffer* pad_before_eq; + Textbuffer* pad_after_eq; + Py_UNICODE quoter; + Py_ssize_t reset; +} TagData; + +/* Functions */ + +TagData* TagData_new(void); +void TagData_dealloc(TagData*); +int TagData_reset_buffers(TagData*); diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h index 36b2207..389a9fe 100644 --- a/mwparserfromhell/parser/ctokenizer/textbuffer.h +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -20,17 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "common.h" - -/* Structs */ +#pragma once -struct Textbuffer { - Py_ssize_t size; - Py_UNICODE* data; - struct Textbuffer* prev; - struct Textbuffer* next; -}; -typedef struct Textbuffer Textbuffer; +#include "common.h" /* Functions */ diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c new file mode 100644 index 0000000..1e6424d --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -0,0 +1,2750 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tok_parse.h" +#include "contexts.h" +#include "tag_data.h" +#include "tok_support.h" +#include "tokens.h" + +#define DIGITS "0123456789" +#define HEXDIGITS "0123456789abcdefABCDEF" +#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + +static const char MARKERS[] = { + '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', + '-', '!', '\n', '\0'}; + +#define NUM_MARKERS 19 +#define MAX_BRACES 255 +#define MAX_ENTITY_SIZE 8 + +#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) +#define IS_SCHEME(scheme, slashes, reverse) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) + +#ifdef IS_PY3K + #define NEW_INT_FUNC PyLong_FromSsize_t +#else + #define NEW_INT_FUNC PyInt_FromSsize_t +#endif + +typedef struct { + PyObject* title; + int level; +} HeadingData; + +/* Forward declarations */ + +static int Tokenizer_parse_entity(Tokenizer*); +static int Tokenizer_parse_comment(Tokenizer*); +static int Tokenizer_handle_dl_term(Tokenizer*); +static int Tokenizer_parse_tag(Tokenizer*); + +/* + Determine whether the given Py_UNICODE is a marker. 
+*/ +static int is_marker(Py_UNICODE this) +{ + int i; + + for (i = 0; i < NUM_MARKERS; i++) { + if (MARKERS[i] == this) + return 1; + } + return 0; +} + +/* + Given a context, return the heading level encoded within it. +*/ +static int heading_level_from_context(uint64_t n) +{ + int level; + + n /= LC_HEADING_LEVEL_1; + for (level = 1; n > 1; n >>= 1) + level++; + return level; +} + +/* + Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as + parameters, and return its output as a bool. +*/ +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, + PyObject* in3) +{ + PyObject* func = PyObject_GetAttrString(definitions, funcname); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); + int ans = (result == Py_True) ? 1 : 0; + + Py_DECREF(func); + Py_DECREF(result); + return ans; +} + +/* + Sanitize the name of a tag so it can be compared with others for equality. +*/ +static PyObject* strip_tag_name(PyObject* token, int take_attr) +{ + PyObject *text, *rstripped, *lowered; + + if (take_attr) { + text = PyObject_GetAttrString(token, "text"); + if (!text) + return NULL; + rstripped = PyObject_CallMethod(text, "rstrip", NULL); + Py_DECREF(text); + } + else + rstripped = PyObject_CallMethod(token, "rstrip", NULL); + if (!rstripped) + return NULL; + lowered = PyObject_CallMethod(rstripped, "lower", NULL); + Py_DECREF(rstripped); + return lowered; +} + +/* + Parse a template at the head of the wikicode string. 
+*/ +static int Tokenizer_parse_template(Tokenizer* self) +{ + PyObject *template; + Py_ssize_t reset = self->head; + + template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!template) + return -1; + if (Tokenizer_emit_first(self, TemplateOpen)) { + Py_DECREF(template); + return -1; + } + if (Tokenizer_emit_all(self, template)) { + Py_DECREF(template); + return -1; + } + Py_DECREF(template); + if (Tokenizer_emit(self, TemplateClose)) + return -1; + return 0; +} + +/* + Parse an argument at the head of the wikicode string. +*/ +static int Tokenizer_parse_argument(Tokenizer* self) +{ + PyObject *argument; + Py_ssize_t reset = self->head; + + argument = Tokenizer_parse(self, LC_ARGUMENT_NAME, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!argument) + return -1; + if (Tokenizer_emit_first(self, ArgumentOpen)) { + Py_DECREF(argument); + return -1; + } + if (Tokenizer_emit_all(self, argument)) { + Py_DECREF(argument); + return -1; + } + Py_DECREF(argument); + if (Tokenizer_emit(self, ArgumentClose)) + return -1; + return 0; +} + +/* + Parse a template or argument at the head of the wikicode string. 
+*/ +static int Tokenizer_parse_template_or_argument(Tokenizer* self) +{ + unsigned int braces = 2, i; + PyObject *tokenlist; + + self->head += 2; + while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) { + self->head++; + braces++; + } + if (Tokenizer_push(self, 0)) + return -1; + while (braces) { + if (braces == 1) { + if (Tokenizer_emit_text_then_stack(self, "{")) + return -1; + return 0; + } + if (braces == 2) { + if (Tokenizer_parse_template(self)) + return -1; + if (BAD_ROUTE) { + RESET_ROUTE(); + if (Tokenizer_emit_text_then_stack(self, "{{")) + return -1; + return 0; + } + break; + } + if (Tokenizer_parse_argument(self)) + return -1; + if (BAD_ROUTE) { + RESET_ROUTE(); + if (Tokenizer_parse_template(self)) + return -1; + if (BAD_ROUTE) { + char text[MAX_BRACES + 1]; + RESET_ROUTE(); + for (i = 0; i < braces; i++) text[i] = '{'; + text[braces] = '\0'; + if (Tokenizer_emit_text_then_stack(self, text)) + return -1; + return 0; + } + else + braces -= 2; + } + else + braces -= 3; + if (braces) + self->head++; + } + tokenlist = Tokenizer_pop(self); + if (!tokenlist) + return -1; + if (Tokenizer_emit_all(self, tokenlist)) { + Py_DECREF(tokenlist); + return -1; + } + Py_DECREF(tokenlist); + if (self->topstack->context & LC_FAIL_NEXT) + self->topstack->context ^= LC_FAIL_NEXT; + return 0; +} + +/* + Handle a template parameter at the head of the string. 
+*/ +static int Tokenizer_handle_template_param(Tokenizer* self) +{ + PyObject *stack; + + if (self->topstack->context & LC_TEMPLATE_NAME) + self->topstack->context ^= LC_TEMPLATE_NAME; + else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) + self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; + if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return -1; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + } + else + self->topstack->context |= LC_TEMPLATE_PARAM_KEY; + if (Tokenizer_emit(self, TemplateParamSeparator)) + return -1; + if (Tokenizer_push(self, self->topstack->context)) + return -1; + return 0; +} + +/* + Handle a template parameter's value at the head of the string. +*/ +static int Tokenizer_handle_template_param_value(Tokenizer* self) +{ + PyObject *stack; + + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return -1; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + self->topstack->context ^= LC_TEMPLATE_PARAM_KEY; + self->topstack->context |= LC_TEMPLATE_PARAM_VALUE; + if (Tokenizer_emit(self, TemplateParamEquals)) + return -1; + return 0; +} + +/* + Handle the end of a template at the head of the string. +*/ +static PyObject* Tokenizer_handle_template_end(Tokenizer* self) +{ + PyObject* stack; + + if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return NULL; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return NULL; + } + Py_DECREF(stack); + } + self->head++; + stack = Tokenizer_pop(self); + return stack; +} + +/* + Handle the separator between an argument's name and default. 
+*/ +static int Tokenizer_handle_argument_separator(Tokenizer* self) +{ + self->topstack->context ^= LC_ARGUMENT_NAME; + self->topstack->context |= LC_ARGUMENT_DEFAULT; + if (Tokenizer_emit(self, ArgumentSeparator)) + return -1; + return 0; +} + +/* + Handle the end of an argument at the head of the string. +*/ +static PyObject* Tokenizer_handle_argument_end(Tokenizer* self) +{ + PyObject* stack = Tokenizer_pop(self); + + self->head += 2; + return stack; +} + +/* + Parse an internal wikilink at the head of the wikicode string. +*/ +static int Tokenizer_parse_wikilink(Tokenizer* self) +{ + Py_ssize_t reset; + PyObject *wikilink; + + self->head += 2; + reset = self->head - 1; + wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (Tokenizer_emit_text(self, "[[")) + return -1; + return 0; + } + if (!wikilink) + return -1; + if (Tokenizer_emit(self, WikilinkOpen)) { + Py_DECREF(wikilink); + return -1; + } + if (Tokenizer_emit_all(self, wikilink)) { + Py_DECREF(wikilink); + return -1; + } + Py_DECREF(wikilink); + if (Tokenizer_emit(self, WikilinkClose)) + return -1; + return 0; +} + +/* + Handle the separator between a wikilink's title and its text. +*/ +static int Tokenizer_handle_wikilink_separator(Tokenizer* self) +{ + self->topstack->context ^= LC_WIKILINK_TITLE; + self->topstack->context |= LC_WIKILINK_TEXT; + if (Tokenizer_emit(self, WikilinkSeparator)) + return -1; + return 0; +} + +/* + Handle the end of a wikilink at the head of the string. +*/ +static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) +{ + PyObject* stack = Tokenizer_pop(self); + self->head += 1; + return stack; +} + +/* + Parse the URI scheme of a bracket-enclosed external link. 
+*/ +static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer* buffer; + PyObject* scheme; + Py_UNICODE this; + int slashes, i; + + if (Tokenizer_push(self, LC_EXT_LINK_URI)) + return -1; + if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + else { + buffer = Textbuffer_new(); + if (!buffer) + return -1; + while ((this = Tokenizer_READ(self, 0))) { + i = 0; + while (1) { + if (!valid[i]) + goto end_of_loop; + if (this == valid[i]) + break; + i++; + } + Textbuffer_write(&buffer, this); + if (Tokenizer_emit_char(self, this)) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + } + end_of_loop: + if (this != ':') { + Textbuffer_dealloc(buffer); + Tokenizer_fail_route(self); + return 0; + } + if (Tokenizer_emit_char(self, ':')) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + slashes = (Tokenizer_READ(self, 0) == '/' && + Tokenizer_READ(self, 1) == '/'); + if (slashes) { + if (Tokenizer_emit_text(self, "//")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head += 2; + } + scheme = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); + if (!scheme) + return -1; + if (!IS_SCHEME(scheme, slashes, 0)) { + Py_DECREF(scheme); + Tokenizer_fail_route(self); + return 0; + } + Py_DECREF(scheme); + } + return 0; +} + +/* + Parse the URI scheme of a free (no brackets) external link. 
+*/ +static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + PyObject *scheme; + Py_UNICODE chunk; + Py_ssize_t i; + int slashes, j; + + if (!scheme_buffer) + return -1; + // We have to backtrack through the textbuffer looking for our scheme since + // it was just parsed as text: + temp_buffer = self->topstack->textbuffer; + while (temp_buffer) { + for (i = temp_buffer->size - 1; i >= 0; i--) { + chunk = temp_buffer->data[i]; + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + while (1) { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + if (chunk == valid[j]) + break; + j++; + } + Textbuffer_write(&scheme_buffer, chunk); + } + temp_buffer = temp_buffer->next; + } + end_of_loop: + scheme = Textbuffer_render(scheme_buffer); + if (!scheme) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + slashes = (Tokenizer_READ(self, 0) == '/' && + Tokenizer_READ(self, 1) == '/'); + if (!IS_SCHEME(scheme, slashes, 1)) { + Py_DECREF(scheme); + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + Py_DECREF(scheme); + if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) + return -1; + if (Tokenizer_emit_char(self, ':')) + return -1; + if (slashes) { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + return 0; +} + +/* + Handle text in a free external link, including trailing punctuation. 
*/
static int
Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
                                Textbuffer** tail, Py_UNICODE this)
{
    // Flush any punctuation held in *tail back into the token stream, then
    // start a fresh tail buffer. NOTE: this macro contains early returns and
    // is also used by Tokenizer_really_parse_external_link below.
    #define PUSH_TAIL_BUFFER(tail, error) \
        if ((tail)->size || (tail)->next) { \
            if (Tokenizer_emit_textbuffer(self, tail, 0)) \
                return error; \
            tail = Textbuffer_new(); \
            if (!(tail)) \
                return error; \
        }

    if (this == '(' && !(*parens)) {
        // First opening paren: from here on, ')' is part of the link.
        *parens = 1;
        PUSH_TAIL_BUFFER(*tail, -1)
    }
    else if (this == ',' || this == ';' || this == '\\' || this == '.' ||
             this == ':' || this == '!' || this == '?' ||
             (!(*parens) && this == ')'))
        // Trailing punctuation is buffered in *tail — it only becomes part of
        // the link if more link text follows it.
        return Textbuffer_write(tail, this);
    else
        PUSH_TAIL_BUFFER(*tail, -1)
    return Tokenizer_emit_char(self, this);
}

/*
    Return whether the current head is the end of a free link.
*/
static int
Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
{
    // Built from Tokenizer_parse()'s end sentinels:
    Py_UNICODE after = Tokenizer_READ(self, 2);
    uint64_t ctx = self->topstack->context;

    // A free link ends at EOF, a newline, brackets/angle brackets, a ''
    // (wiki italic/bold) pair, or — depending on context — a template
    // pipe/equals or closing template/argument braces.
    return (!this || this == '\n' || this == '[' || this == ']' ||
            this == '<' || this == '>' || (this == '\'' && next == '\'') ||
            (this == '|' && ctx & LC_TEMPLATE) ||
            (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
            (this == '}' && next == '}' &&
             (ctx & LC_TEMPLATE || (after == '}' && ctx & LC_ARGUMENT))));
}

/*
    Really parse an external link.

    Returns the popped token stack on success, NULL on error or failed route.
    *extra receives trailing text (e.g. punctuation) that belongs outside the
    link; the caller owns and must deallocate it.
*/
static PyObject*
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
                                     Textbuffer** extra)
{
    Py_UNICODE this, next;
    int parens = 0;

    if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) :
                   Tokenizer_parse_free_uri_scheme(self))
        return NULL;
    if (BAD_ROUTE)
        return NULL;
    this = Tokenizer_READ(self, 0);
    // A link cannot start with EOF, newline, space, or a closing bracket.
    if (!this || this == '\n' || this == ' ' || this == ']')
        return Tokenizer_fail_route(self);
    if (!brackets && this == '[')
        return Tokenizer_fail_route(self);
    while (1) {
        this = Tokenizer_READ(self, 0);
        next = Tokenizer_READ(self, 1);
        if (this == '&') {
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_entity(self))
                return NULL;
        }
        else if (this == '<' && next == '!'
                 && Tokenizer_READ(self, 2) == '-'
                 && Tokenizer_READ(self, 3) == '-') {
            // "<!--": an HTML comment embedded in the link.
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_comment(self))
                return NULL;
        }
        else if (!brackets && Tokenizer_is_free_link(self, this, next)) {
            // End of a free link: back up so the sentinel is re-read by the
            // caller, and return what we've collected.
            self->head--;
            return Tokenizer_pop(self);
        }
        else if (!this || this == '\n')
            return Tokenizer_fail_route(self);
        else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_template_or_argument(self))
                return NULL;
        }
        else if (this == ']')
            return Tokenizer_pop(self);
        else if (this == ' ') {
            if (brackets) {
                // Space inside a bracketed link separates URI from title.
                if (Tokenizer_emit(self, ExternalLinkSeparator))
                    return NULL;
                self->topstack->context ^= LC_EXT_LINK_URI;
                self->topstack->context |= LC_EXT_LINK_TITLE;
                self->head++;
                return Tokenizer_parse(self, 0, 0);
            }
            // Space ends a free link; the space itself goes to *extra.
            if (Textbuffer_write(extra, ' '))
                return NULL;
            return Tokenizer_pop(self);
        }
        else if (!brackets) {
            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
                return NULL;
        }
        else {
            if (Tokenizer_emit_char(self, this))
                return NULL;
        }
        self->head++;
    }
}

/*
    Remove the URI scheme of a new external link from the textbuffer.
*/
static int
Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
{
    // link[0] is the first emitted token; its "text" attribute holds the URI.
    PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
             *split, *scheme;
    Py_ssize_t length;
    Textbuffer* temp;

    if (!text)
        return -1;
    // Equivalent of text.split(":", 1); the scheme is everything before the
    // first colon.
    split = PyObject_CallMethod(text, "split", "si", ":", 1);
    Py_DECREF(text);
    if (!split)
        return -1;
    scheme = PyList_GET_ITEM(split, 0);  // borrowed reference
    length = PyUnicode_GET_SIZE(scheme);
    // Strip `length` characters off the end of the textbuffer chain, freeing
    // nodes that are completely consumed.
    while (length) {
        temp = self->topstack->textbuffer;
        if (length <= temp->size) {
            temp->size -= length;
            break;
        }
        length -= temp->size;
        self->topstack->textbuffer = temp->next;
        free(temp->data);
        free(temp);
    }
    Py_DECREF(split);
    return 0;
}

/*
    Parse an external link at the head of the wikicode string.
*/
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
{
    #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
    // Fall back to emitting plain text (or a definition-list term) when the
    // head turns out not to start a link. Contains a return.
    #define NOT_A_LINK \
        if (!brackets && self->topstack->context & LC_DLTERM) \
            return Tokenizer_handle_dl_term(self); \
        return Tokenizer_emit_char(self, Tokenizer_READ(self, 0))

    Py_ssize_t reset = self->head;
    PyObject *link, *kwargs;
    Textbuffer *extra = 0;

    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
        NOT_A_LINK;
    }
    extra = Textbuffer_new();
    if (!extra)
        return -1;
    self->head++;
    link = Tokenizer_really_parse_external_link(self, brackets, &extra);
    if (BAD_ROUTE) {
        // The parse failed non-fatally: rewind and treat as ordinary text.
        RESET_ROUTE();
        self->head = reset;
        Textbuffer_dealloc(extra);
        NOT_A_LINK;
    }
    if (!link) {
        Textbuffer_dealloc(extra);
        return -1;
    }
    if (!brackets) {
        // Free links: the scheme was already emitted as plain text before we
        // knew this was a link, so remove it from the textbuffer.
        if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) {
            Textbuffer_dealloc(extra);
            Py_DECREF(link);
            return -1;
        }
    }
    kwargs = PyDict_New();
    if (!kwargs) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False);
    if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    if (Tokenizer_emit_all(self, link)) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    Py_DECREF(link);
    if (Tokenizer_emit(self, ExternalLinkClose)) {
        Textbuffer_dealloc(extra);
        return -1;
    }
    // Emit any trailing punctuation collected after the link; this hands
    // ownership of `extra` to Tokenizer_emit_textbuffer.
    if (extra->size || extra->next)
        return Tokenizer_emit_textbuffer(self, extra, 0);
    Textbuffer_dealloc(extra);
    return 0;
}

/*
    Parse a section heading at the head of the wikicode string.
*/
static int Tokenizer_parse_heading(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    int best = 1, i, context, diff;
    HeadingData *heading;
    PyObject *level, *kwargs;

    self->global |= GL_HEADING;
    self->head += 1;
    // Count leading '='s; `best` is the number of equals signs seen.
    while (Tokenizer_READ(self, 0) == '=') {
        best++;
        self->head++;
    }
    // Clamp to level 6 (the shift caps at 5, i.e. LC_HEADING_LEVEL_6).
    context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
    heading = (HeadingData*) Tokenizer_parse(self, context, 1);
    if (BAD_ROUTE) {
        // Not a heading after all: rewind and emit the '='s as literal text.
        RESET_ROUTE();
        self->head = reset + best - 1;
        for (i = 0; i < best; i++) {
            if (Tokenizer_emit_char(self, '='))
                return -1;
        }
        self->global ^= GL_HEADING;
        return 0;
    }
    level = NEW_INT_FUNC(heading->level);
    if (!level) {
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(level);
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    PyDict_SetItemString(kwargs, "level", level);
    Py_DECREF(level);
    if (Tokenizer_emit_kwargs(self, HeadingStart, kwargs)) {
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    // If the closing run of '='s was shorter than the opening one, the extra
    // opening '='s belong to the title text.
    if (heading->level < best) {
        diff = best - heading->level;
        for (i = 0; i < diff; i++) {
            if (Tokenizer_emit_char(self, '=')) {
                Py_DECREF(heading->title);
                free(heading);
                return -1;
            }
        }
    }
    if (Tokenizer_emit_all(self, heading->title)) {
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    Py_DECREF(heading->title);
    free(heading);
    if (Tokenizer_emit(self, HeadingEnd))
        return -1;
    self->global ^= GL_HEADING;
    return 0;
}

/*
    Handle the end of a section heading at the head of the string.

    Returns a malloc'd HeadingData (caller frees) or NULL on error.
*/
static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    int best, i, current, level, diff;
    HeadingData *after, *heading;
    PyObject *stack;

    self->head += 1;
    best = 1;
    while (Tokenizer_READ(self, 0) == '=') {
        best++;
        self->head++;
    }
    current = heading_level_from_context(self->topstack->context);
    // Effective level is min(current, best), each capped at 6.
    level = current > best ? (best > 6 ? 6 : best) :
                             (current > 6 ? 6 : current);
    // Try to parse past this run of '='s in case the heading continues.
    after = (HeadingData*) Tokenizer_parse(self, self->topstack->context, 1);
    if (BAD_ROUTE) {
        RESET_ROUTE();
        // Surplus '='s (beyond the effective level) become title text.
        if (level < best) {
            diff = best - level;
            for (i = 0; i < diff; i++) {
                if (Tokenizer_emit_char(self, '='))
                    return NULL;
            }
        }
        self->head = reset + best - 1;
    }
    else {
        // The heading continued: this '=' run was interior, so emit it
        // verbatim followed by the rest of the title.
        for (i = 0; i < best; i++) {
            if (Tokenizer_emit_char(self, '=')) {
                Py_DECREF(after->title);
                free(after);
                return NULL;
            }
        }
        if (Tokenizer_emit_all(self, after->title)) {
            Py_DECREF(after->title);
            free(after);
            return NULL;
        }
        Py_DECREF(after->title);
        level = after->level;
        free(after);
    }
    stack = Tokenizer_pop(self);
    if (!stack)
        return NULL;
    heading = malloc(sizeof(HeadingData));
    if (!heading) {
        PyErr_NoMemory();
        return NULL;
    }
    heading->title = stack;  // heading now owns the stack reference
    heading->level = level;
    return heading;
}

/*
    Actually parse an HTML entity and ensure that it is valid.
*/
static int Tokenizer_really_parse_entity(Tokenizer* self)
{
    PyObject *kwargs, *textobj;
    Py_UNICODE this;
    int numeric, hexadecimal, i, j, zeroes, test;
    char *valid, *text, *buffer, *def;

    // Fail the current route and release `text`. Contains a return; only
    // valid after `text` has been allocated.
    #define FAIL_ROUTE_AND_EXIT() { \
        Tokenizer_fail_route(self); \
        free(text); \
        return 0; \
    }

    if (Tokenizer_emit(self, HTMLEntityStart))
        return -1;
    self->head++;
    this = Tokenizer_READ(self, 0);
    if (!this) {
        Tokenizer_fail_route(self);
        return 0;
    }
    if (this == '#') {
        // Numeric entity: "&#...;" — possibly hexadecimal ("&#x...;").
        numeric = 1;
        if (Tokenizer_emit(self, HTMLEntityNumeric))
            return -1;
        self->head++;
        this = Tokenizer_READ(self, 0);
        if (!this) {
            Tokenizer_fail_route(self);
            return 0;
        }
        if (this == 'x' || this == 'X') {
            hexadecimal = 1;
            kwargs = PyDict_New();
            if (!kwargs)
                return -1;
            // Preserve the original 'x'/'X' character in the token.
            PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0));
            if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs))
                return -1;
            self->head++;
        }
        else
            hexadecimal = 0;
    }
    else
        numeric = hexadecimal = 0;
    // Select the allowed character set for the entity body.
    if (hexadecimal)
        valid = HEXDIGITS;
    else if (numeric)
        valid = DIGITS;
    else
        valid = ALPHANUM;
    text = calloc(MAX_ENTITY_SIZE, sizeof(char));
    if (!text) {
        PyErr_NoMemory();
        return -1;
    }
    i = 0;
    zeroes = 0;
    while (1) {
        this = Tokenizer_READ(self, 0);
        if (this == ';') {
            if (i == 0)
                FAIL_ROUTE_AND_EXIT()
            break;
        }
        // Leading zeroes are counted separately and re-prepended later so
        // they don't consume entity-size budget or break the defs lookup.
        if (i == 0 && this == '0') {
            zeroes++;
            self->head++;
            continue;
        }
        if (i >= MAX_ENTITY_SIZE)
            FAIL_ROUTE_AND_EXIT()
        if (is_marker(this))
            FAIL_ROUTE_AND_EXIT()
        // Reject any character outside the selected valid set.
        j = 0;
        while (1) {
            if (!valid[j])
                FAIL_ROUTE_AND_EXIT()
            if (this == valid[j])
                break;
            j++;
        }
        text[i] = (char) this;
        self->head++;
        i++;
    }
    if (numeric) {
        // NOTE(review): if sscanf matches nothing, `test` is read
        // uninitialized below — the preceding valid-set check should make
        // that impossible, but confirm.
        sscanf(text, (hexadecimal ? "%x" : "%d"), &test);
        // Code point must be within the Unicode range.
        if (test < 1 || test > 0x10FFFF)
            FAIL_ROUTE_AND_EXIT()
    }
    else {
        // Named entity: must appear in the NULL-terminated entitydefs table.
        i = 0;
        while (1) {
            def = entitydefs[i];
            if (!def)  // We've reached the end of the defs without finding it
                FAIL_ROUTE_AND_EXIT()
            if (strcmp(text, def) == 0)
                break;
            i++;
        }
    }
    if (zeroes) {
        // Re-prepend the leading zeroes that were skipped above.
        buffer = calloc(strlen(text) + zeroes + 1, sizeof(char));
        if (!buffer) {
            free(text);
            PyErr_NoMemory();
            return -1;
        }
        for (i = 0; i < zeroes; i++)
            strcat(buffer, "0");
        strcat(buffer, text);
        free(text);
        text = buffer;
    }
    textobj = PyUnicode_FromString(text);
    if (!textobj) {
        free(text);
        return -1;
    }
    free(text);
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(textobj);
        return -1;
    }
    PyDict_SetItemString(kwargs, "text", textobj);
    Py_DECREF(textobj);
    if (Tokenizer_emit_kwargs(self, Text, kwargs))
        return -1;
    if (Tokenizer_emit(self, HTMLEntityEnd))
        return -1;
    return 0;
}

/*
    Parse an HTML entity at the head of the wikicode string.
*/
static int Tokenizer_parse_entity(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    PyObject *tokenlist;

    // Parse the entity on its own stack so a failed route can be discarded.
    if (Tokenizer_push(self, 0))
        return -1;
    if (Tokenizer_really_parse_entity(self))
        return -1;
    if (BAD_ROUTE) {
        // Invalid entity: rewind and emit the '&' as literal text.
        RESET_ROUTE();
        self->head = reset;
        if (Tokenizer_emit_char(self, '&'))
            return -1;
        return 0;
    }
    tokenlist = Tokenizer_pop(self);
    if (!tokenlist)
        return -1;
    if (Tokenizer_emit_all(self, tokenlist)) {
        Py_DECREF(tokenlist);
        return -1;
    }
    Py_DECREF(tokenlist);
    return 0;
}

/*
    Parse an HTML comment at the head of the wikicode string.
*/
static int Tokenizer_parse_comment(Tokenizer* self)
{
    // reset points just past "<!-" so a rewind re-reads the comment as text.
    Py_ssize_t reset = self->head + 3;
    PyObject *comment;
    Py_UNICODE this;

    self->head += 4;  // skip over "<!--"
    if (Tokenizer_push(self, 0))
        return -1;
    while (1) {
        this = Tokenizer_READ(self, 0);
        if (!this) {
            // Unterminated comment: discard the pushed stack and rewind.
            comment = Tokenizer_pop(self);
            Py_XDECREF(comment);
            self->head = reset;
            return Tokenizer_emit_text(self, "