diff --git a/.gitignore b/.gitignore
index ec4e8ca..4068716 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.pyc
 *.so
+*.dll
 *.egg
 *.egg-info
 .DS_Store
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..cbe2933
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,41 @@
+v0.3 (unreleased):
+
+- Various fixes and cleanup.
+
+v0.2 (released June 20, 2013):
+
+- The parser now fully supports Python 3 in addition to Python 2.7.
+- Added a C tokenizer extension that is significantly faster than its Python
+  equivalent. It is enabled by default (if available) and can be toggled by
+  setting `mwparserfromhell.parser.use_c` to a boolean value.
+- Added a complete set of unit tests covering parsing and wikicode
+  manipulation.
+- Renamed Wikicode.filter_links() to filter_wikilinks() (applies to ifilter as
+  well).
+- Added filter methods for Arguments, Comments, Headings, and HTMLEntities.
+- Added 'before' param to Template.add(); renamed 'force_nonconformity' to
+  'preserve_spacing'.
+- Added 'include_lead' param to Wikicode.get_sections().
+- Removed 'flat' param from Wikicode.get_sections().
+- Removed 'force_no_field' param from Template.remove().
+- Added support for Travis CI.
+- Added note about Windows build issue in the README.
+- The tokenizer will limit itself to a realistic recursion depth to prevent
+  errors and unreasonably long parse times.
+- Fixed how some nodes' attribute setters handle input.
+- Fixed multiple bugs in the tokenizer's handling of invalid markup.
+- Fixed bugs in the implementation of SmartList and StringMixIn.
+- Fixed some broken example code in the README; other copyedits.
+- Other bugfixes and code cleanup.
+
+v0.1.1 (released September 21, 2012):
+
+- Added support for Comments (<!-- foo -->) and Wikilinks ([[foo]]).
+- Added corresponding ifilter_links() and filter_links() methods to Wikicode.
+- Fixed a bug when parsing incomplete templates.
+- Fixed strip_code() to affect the contents of headings.
+- Various copyedits in documentation and comments.
+
+v0.1 (released August 23, 2012):
+
+- Initial release.
diff --git a/README.rst b/README.rst
index 77c01eb..df4d732 100644
--- a/README.rst
+++ b/README.rst
@@ -9,7 +9,8 @@ mwparserfromhell
 that provides an easy-to-use and outrageously powerful parser for MediaWiki_
 wikicode. It supports Python 2 and Python 3.
 
-Developed by Earwig_ with help from `Σ`_.
+Developed by Earwig_ with help from `Σ`_. Full documentation is available on
+ReadTheDocs_.
 
 Installation
 ------------
@@ -142,6 +143,7 @@ following code (via the API_)::
         return mwparserfromhell.parse(text)
 
 .. _MediaWiki: http://mediawiki.org
+.. _ReadTheDocs: http://mwparserfromhell.readthedocs.org
 .. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig
 .. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3
 .. _Python Package Index: http://pypi.python.org
diff --git a/docs/changelog.rst b/docs/changelog.rst
new file mode 100644
index 0000000..4bf86b7
--- /dev/null
+++ b/docs/changelog.rst
@@ -0,0 +1,66 @@
+Changelog
+=========
+
+v0.3
+----
+
+Unreleased
+(`changes `__):
+
+- Various fixes and cleanup.
+
+v0.2
+----
+
+`Released June 20, 2013 `_
+(`changes `__):
+
+- The parser now fully supports Python 3 in addition to Python 2.7.
+- Added a C tokenizer extension that is significantly faster than its Python
+  equivalent. It is enabled by default (if available) and can be toggled by
+  setting :py:attr:`mwparserfromhell.parser.use_c` to a boolean value.
+- Added a complete set of unit tests covering parsing and wikicode
+  manipulation.
+- Renamed :py:meth:`.filter_links` to :py:meth:`.filter_wikilinks` (applies to
+  :py:meth:`.ifilter` as well).
+- Added filter methods for :py:class:`Arguments <.Argument>`,
+  :py:class:`Comments <.Comment>`, :py:class:`Headings <.Heading>`, and
+  :py:class:`HTMLEntities <.HTMLEntity>`.
+- Added *before* param to :py:meth:`Template.add() <.Template.add>`; renamed
+  *force_nonconformity* to *preserve_spacing*.
+- Added *include_lead* param to :py:meth:`Wikicode.get_sections()
+  <.get_sections>`.
+- Removed *flat* param from :py:meth:`.get_sections`.
+- Removed *force_no_field* param from :py:meth:`Template.remove()
+  <.Template.remove>`.
+- Added support for Travis CI.
+- Added note about Windows build issue in the README.
+- The tokenizer will limit itself to a realistic recursion depth to prevent
+  errors and unreasonably long parse times.
+- Fixed how some nodes' attribute setters handle input.
+- Fixed multiple bugs in the tokenizer's handling of invalid markup.
+- Fixed bugs in the implementation of :py:class:`.SmartList` and
+  :py:class:`.StringMixIn`.
+- Fixed some broken example code in the README; other copyedits.
+- Other bugfixes and code cleanup.
+
+v0.1.1
+------
+
+`Released September 21, 2012 `_
+(`changes `__):
+
+- Added support for :py:class:`Comments <.Comment>` (``<!-- foo -->``) and
+  :py:class:`Wikilinks <.Wikilink>` (``[[foo]]``).
+- Added corresponding :py:meth:`.ifilter_links` and :py:meth:`.filter_links`
+  methods to :py:class:`.Wikicode`.
+- Fixed a bug when parsing incomplete templates.
+- Fixed :py:meth:`.strip_code` to affect the contents of headings.
+- Various copyedits in documentation and comments.
+
+v0.1
+----
+
+`Released August 23, 2012 `_:
+
+- Initial release.
diff --git a/docs/index.rst b/docs/index.rst
index 4b4c392..0603daf 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,5 +1,5 @@
-MWParserFromHell v0.2 Documentation
-===================================
+MWParserFromHell v\ |version| Documentation
+===========================================
 
 :py:mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python
 package that provides an easy-to-use and outrageously powerful parser for
@@ -41,6 +41,7 @@ Contents
 
    usage
    integration
+   changelog
 
 API Reference
 -------------
diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py
index 99bc0c2..738d4c2 100644
--- a/mwparserfromhell/__init__.py
+++ b/mwparserfromhell/__init__.py
@@ -31,7 +31,7 @@ from __future__ import unicode_literals
 __author__ = "Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic"
 __license__ = "MIT License"
-__version__ = "0.2.dev"
+__version__ = "0.3.dev"
 __email__ = "ben.kurtovic@verizon.net"
 
 from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode
diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py
old mode 100755
new mode 100644
diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py
index 3834d41..6dfc4f0 100644
--- a/mwparserfromhell/nodes/template.py
+++ b/mwparserfromhell/nodes/template.py
@@ -293,7 +293,7 @@ class Template(Node):
         """
         name = name.strip() if isinstance(name, basestring) else str(name)
         removed = False
-        to_remove =[]
+        to_remove = []
         for i, param in enumerate(self.params):
             if param.name.strip() == name:
                 if keep_field:
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index ca9fe8a..99f8c9c 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -23,9 +23,16 @@ SOFTWARE.
 
 #include "tokenizer.h"
 
-double log2(double n)
+/*
+    Given a context, return the heading level encoded within it.
+*/
+static int heading_level_from_context(int n)
 {
-    return log(n) / log(2);
+    int level;
+    n /= LC_HEADING_LEVEL_1;
+    for (level = 1; n > 1; n >>= 1)
+        level++;
+    return level;
 }
 
 static PyObject*
@@ -175,6 +182,9 @@ Tokenizer_push_textbuffer(Tokenizer* self)
     return 0;
 }
 
+/*
+    Pop and deallocate the top token stack/context/textbuffer.
+*/
 static void
 Tokenizer_delete_top_of_stack(Tokenizer* self)
 {
@@ -857,7 +867,7 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         best++;
         self->head++;
     }
-    current = log2(self->topstack->context / LC_HEADING_LEVEL_1) + 1;
+    current = heading_level_from_context(self->topstack->context);
     level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
     after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
 
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index cdc0cca..1f58c49 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -181,6 +181,7 @@ typedef struct {
 
 /* Function prototypes: */
 
+static int heading_level_from_context(int);
 static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
 static struct Textbuffer* Textbuffer_new(void);
 static void Tokenizer_dealloc(Tokenizer*);
diff --git a/mwparserfromhell/string_mixin.py b/mwparserfromhell/string_mixin.py
index 89c1bc0..a406401 100644
--- a/mwparserfromhell/string_mixin.py
+++ b/mwparserfromhell/string_mixin.py
@@ -253,12 +253,12 @@ class StringMixIn(object):
     if py3k:
         @staticmethod
         @inheritdoc
-        def maketrans(self, x, y=None, z=None):
+        def maketrans(x, y=None, z=None):
             if z is None:
                 if y is None:
-                    return self.__unicode__.maketrans(x)
-                return self.__unicode__.maketrans(x, y)
-            return self.__unicode__.maketrans(x, y, z)
+                    return str.maketrans(x)
+                return str.maketrans(x, y)
+            return str.maketrans(x, y, z)
 
     @inheritdoc
     def partition(self, sep):
diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py
index 581707d..4ec889e 100644
--- a/mwparserfromhell/wikicode.py
+++ b/mwparserfromhell/wikicode.py
@@ -168,7 +168,7 @@ class Wikicode(StringMixIn):
         doc = """Iterate over {0}.
 
         This is equivalent to :py:meth:`{1}` with *forcetype* set to
-        :py:class:`~.{2}`.
+        :py:class:`~{2.__module__}.{2.__name__}`.
         """
         make_ifilter = lambda ftype: (lambda self, **kw:
                                       self.ifilter(forcetype=ftype, **kw))
@@ -177,8 +177,8 @@ class Wikicode(StringMixIn):
         for name, ftype in (meths.items() if py3k else meths.iteritems()):
             ifilter = make_ifilter(ftype)
             filter = make_filter(ftype)
-            ifilter.__doc__ = doc.format(name, "ifilter", ftype.__name__)
-            filter.__doc__ = doc.format(name, "filter", ftype.__name__)
+            ifilter.__doc__ = doc.format(name, "ifilter", ftype)
+            filter.__doc__ = doc.format(name, "filter", ftype)
             setattr(cls, "ifilter_" + name, ifilter)
             setattr(cls, "filter_" + name, filter)
 
diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py
index 382a9bf..c1d49cb 100644
--- a/tests/_test_tokenizer.py
+++ b/tests/_test_tokenizer.py
@@ -109,7 +109,7 @@ class TokenizerTestCase(object):
     def build(cls):
        """Load and install all tests from the 'tokenizer' directory."""
         def load_file(filename):
-            with open(filename, "r") as fp:
+            with open(filename, "rU") as fp:
                 text = fp.read()
                 if not py3k:
                     text = text.decode("utf8")
diff --git a/tests/test_string_mixin.py b/tests/test_string_mixin.py
index 306f2fd..b829bb2 100644
--- a/tests/test_string_mixin.py
+++ b/tests/test_string_mixin.py
@@ -414,10 +414,10 @@ class TestStringMixIn(unittest.TestCase):
         self.assertEqual("Fake String", str1.title())
 
         if py3k:
-            table1 = str.maketrans({97: "1", 101: "2", 105: "3", 111: "4",
-                                    117: "5"})
-            table2 = str.maketrans("aeiou", "12345")
-            table3 = str.maketrans("aeiou", "12345", "rts")
+            table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3",
+                                            111: "4", 117: "5"})
+            table2 = StringMixIn.maketrans("aeiou", "12345")
+            table3 = StringMixIn.maketrans("aeiou", "12345", "rts")
             self.assertEqual("f1k2 str3ng", str1.translate(table1))
             self.assertEqual("f1k2 str3ng", str1.translate(table2))
             self.assertEqual("f1k2 3ng", str1.translate(table3))