Kaynağa Gözat

Working on the framework for the tokenizer, plus some cleanup and fixes.

tags/v0.1
Ben Kurtovic 12 yıl önce
ebeveyn
işleme
81e5ce30af
5 değiştirilmiş dosya ile 100 ekleme ve 57 silme
  1. +1
    -1
      mwparserfromhell/nodes/template.py
  2. +6
    -2
      mwparserfromhell/parser/__init__.py
  3. +28
    -28
      mwparserfromhell/parser/builder.py
  4. +43
    -4
      mwparserfromhell/parser/tokenizer.py
  5. +22
    -22
      mwparserfromhell/parser/tokens.py

+ 1
- 1
mwparserfromhell/nodes/template.py Dosyayı Görüntüle

@@ -90,7 +90,7 @@ class Template(Node):
before_theories = defaultdict(lambda: 0)
after_theories = defaultdict(lambda: 0)
for param in self.params:
match = re.search("^(\s*).*?(\s*)$", unicode(param.value), FLAGS)
match = re.search(r"^(\s*).*?(\s*)$", unicode(param.value), FLAGS)
before, after = match.group(1), match.group(2)
before_theories[before] += 1
after_theories[after] += 1


+ 6
- 2
mwparserfromhell/parser/__init__.py Dosyayı Görüntüle

@@ -20,8 +20,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from .builder import Builder
from .tokenizer import Tokenizer
try:
from ._builder import CBuilder as Builder
from ._tokenizer import CTokenizer as Tokenizer
except ImportError:
from .builder import Builder
from .tokenizer import Tokenizer

__all__ = ["Parser"]



+ 28
- 28
mwparserfromhell/parser/builder.py Dosyayı Görüntüle

@@ -49,17 +49,17 @@ class Builder(object):
self._push()
while self._tokens:
token = self._tokens.pop(0)
if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS):
if isinstance(token, tokens.TemplateParamEquals):
key = self._pop()
showkey = True
self._push()
elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR,
tokens.TEMPLATE_CLOSE)):
elif isinstance(token, (tokens.TemplateParamSeparator,
tokens.TemplateClose)):
self._tokens.insert(0, token)
value = self._pop()
return Parameter(key, value, showkey)
else:
self._stack.write(self._handle_token())
self._write(self._handle_token())

def _handle_template(self):
params = []
@@ -68,7 +68,7 @@ class Builder(object):
self._push()
while self._tokens:
token = self._tokens.pop(0)
if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR):
if isinstance(token, tokens.TemplateParamSeparator):
if not params:
name = self._pop()
param = self._handle_parameter(min(int_key_range - int_keys))
@@ -76,18 +76,18 @@ class Builder(object):
int_keys.add(int(param.name))
int_key_range.add(len(int_keys) + 1)
params.append(param)
elif isinstance(token, tokens.TEMPLATE_CLOSE):
elif isinstance(token, tokens.TemplateClose):
if not params:
name = self._pop()
return Template(name, params)
else:
self._stack.write(self._handle_token())
self._write(self._handle_token())

def _handle_entity(self):
token = self._tokens.pop(0)
if isinstance(token, tokens.HTML_ENTITY_NUMERIC):
if isinstance(token, tokens.HTMLEntityNumeric):
token = self._tokens.pop(0)
if isinstance(token, tokens.HTML_ENTITY_HEX):
if isinstance(token, tokens.HTMLEntityHex):
token = self._tokens.pop(0)
return HTMLEntity(token.text, named=False, hexadecimal=True)
return HTMLEntity(token.text, named=False, hexadecimal=False)
@@ -98,30 +98,30 @@ class Builder(object):
self._push()
while self._tokens:
token = self._tokens.pop(0)
if isinstance(token, tokens.HEADING_BLOCK):
if isinstance(token, tokens.HeadingBlock):
title = self._pop()
return Heading(title, level)
else:
self._stack.write(self._handle_token())
self._write(self._handle_token())

def _handle_attribute(self):
name, quoted = None, False
self._push()
while self._tokens:
token = self._tokens.pop(0)
if isinstance(token, tokens.TAG_ATTR_EQUALS):
if isinstance(token, tokens.TagAttrEquals):
name = self._pop()
self._push()
elif isinstance(token, tokens.TAG_ATTR_QUOTE):
elif isinstance(token, tokens.TagAttrQuote):
quoted = True
elif isinstance(token, (tokens.TAG_ATTR_START,
tokens.TAG_CLOSE_OPEN)):
elif isinstance(token, (tokens.TagAttrStart,
tokens.TagCloseOpen)):
self._tokens.insert(0, token)
if name is not None:
return Attribute(name, self._pop(), quoted)
return Attribute(self._pop(), quoted=quoted)
else:
self._stack.write(self._handle_token())
self._write(self._handle_token())

def _handle_tag(self, token):
type_, showtag = token.type, token.showtag
@@ -129,40 +129,40 @@ class Builder(object):
self._push()
while self._tokens:
token = self._tokens.pop(0)
if isinstance(token, tokens.TAG_ATTR_START):
if isinstance(token, tokens.TagAttrStart):
attrs.append(self._handle_attribute())
elif isinstance(token, tokens.TAG_CLOSE_OPEN):
elif isinstance(token, tokens.TagCloseOpen):
open_pad = token.padding
tag = self._pop()
self._push()
elif isinstance(token, tokens.TAG_CLOSE_SELFCLOSE):
elif isinstance(token, tokens.TagCloseSelfclose):
tag = self._pop()
return Tag(type_, tag, attrs=attrs, showtag=showtag,
self_closing=True, open_padding=token.padding)
elif isinstance(token, tokens.TAG_OPEN_CLOSE):
elif isinstance(token, tokens.TagOpenClose):
contents = self._pop()
elif isinstance(token, tokens.TAG_CLOSE_CLOSE):
elif isinstance(token, tokens.TagCloseClose):
return Tag(type_, tag, contents, attrs, showtag, False,
open_pad, token.padding)
else:
self._stack.write(self._handle_token())
self._write(self._handle_token())

def _handle_token(self):
token = self._tokens.pop(0)
if isinstance(token, tokens.TEXT):
if isinstance(token, tokens.Text):
return Text(token.text)
elif isinstance(token, tokens.TEMPLATE_OPEN):
elif isinstance(token, tokens.TemplateOpen):
return self._handle_template()
elif isinstance(token, tokens.HTML_ENTITY_START):
elif isinstance(token, tokens.HTMLEntityStart):
return self._handle_entity()
elif isinstance(token, tokens.HEADING_BLOCK):
elif isinstance(token, tokens.HeadingBlock):
return self._handle_heading(token)
elif isinstance(token, tokens.TAG_OPEN_OPEN):
elif isinstance(token, tokens.TagOpenOpen):
return self._handle_tag(token)

def build(self, tokenlist):
self._tokens = tokenlist
self._push()
while self._tokens:
self._stack.write(self._handle_token())
self._write(self._handle_token())
return self._pop()

+ 43
- 4
mwparserfromhell/parser/tokenizer.py Dosyayı Görüntüle

@@ -25,12 +25,51 @@ from . import tokens
__all__ = ["Tokenizer"]

class Tokenizer(object):
START = object()
END = object()

def __init__(self):
self._text = None
self._head = 0
self._tokens = []
self._stacks = []

self._modifiers = []

def _push(self):
self._stacks.append([])

def _pop(self):
return self._stacks.pop()

def _write(self, token, stack=None):
if stack is None:
stack = self._stacks[-1]
if not stack:
stack.append(token)
return
last = stack[-1]
if isinstance(token, tokens.Text) and isinstance(last, tokens.Text):
last.text += token.text
else:
stack.append(token)

def _read(self, delta=0, wrap=False):
index = self._head + delta
if index < 0 and (not wrap or abs(index) > len(self._text)):
return self.START
if index >= len(self._text):
return self.END
return self._text[index]

def _parse_until(self, stop):
self._push()
while True:
if self._read() in (stop, self.END):
return self._pop()
else:
self._write(tokens.Text(text=self._read()))
self._head += 1

def tokenize(self, text):
self._text = text
self._tokens.append(tokens.TEXT(text=text))
return self._tokens
self._text = list(text)
return self._parse_until(stop=self.END)

+ 22
- 22
mwparserfromhell/parser/tokens.py Dosyayı Görüntüle

@@ -24,43 +24,43 @@ __all__ = ["Token"]

class Token(object):
def __init__(self, **kwargs):
self.__kwargs = kwargs
super(Token, self).__setattr__("_kwargs", kwargs)

def __getattr__(self, key):
return self.__kwargs[key]
return self._kwargs[key]

def __setattr__(self, key, value):
self.__kwargs[key] = value
self._kwargs[key] = value

def __delattr__(self, key):
del self.__kwargs[key]
del self._kwargs[key]


def make(name):
__all__.append(name)
return type(name, (Token,), {})

TEXT = make("TEXT")
Text = make("Text")

TEMPLATE_OPEN = make("TEMPLATE_OPEN") # {{
TEMPLATE_PARAM_SEPARATOR = make("TEMPLATE_PARAM_SEPARATOR") # |
TEMPLATE_PARAM_EQUALS = make("TEMPLATE_PARAM_EQUALS") # =
TEMPLATE_CLOSE = make("TEMPLATE_CLOSE") # }}
TemplateOpen = make("TemplateOpen") # {{
TemplateParamSeparator = make("TemplateParamSeparator") # |
TemplateParamEquals = make("TemplateParamEquals") # =
TemplateClose = make("TemplateClose") # }}

HTML_ENTITY_START = make("HTML_ENTITY_START") # &
HTML_ENTITY_NUMERIC = make("HTML_ENTITY_NUMERIC") # #
HTML_ENTITY_HEX = make("HTML_ENTITY_HEX") # x
HTML_ENTITY_END = make("HTML_ENTITY_END") # ;
HTMLEntityStart = make("HTMLEntityStart") # &
HTMLEntityNumeric = make("HTMLEntityNumeric") # #
HTMLEntityHex = make("HTMLEntityHex") # X
HTMLEntityEnd = make("HTMLEntityEnd") # ;

HEADING_BLOCK = make("HEADING_BLOCK") # =...
HeadingBlock = make("HeadingBlock") # =...

TAG_OPEN_OPEN = make("TAG_OPEN_OPEN") # <
TAG_ATTR_START = make("TAG_ATTR_START")
TAG_ATTR_EQUALS = make("TAG_ATTR_EQUALS") # =
TAG_ATTR_QUOTE = make("TAG_ATTR_QUOTE") # "
TAG_CLOSE_OPEN = make("TAG_CLOSE_OPEN") # >
TAG_CLOSE_SELFCLOSE = make("TAG_CLOSE_SELFCLOSE") # />
TAG_OPEN_CLOSE = make("TAG_OPEN_CLOSE") # </
TAG_CLOSE_CLOSE = make("TAG_CLOSE_CLOSE") # >
TagOpenOpen = make("TagOpenOpen") # <
TagAttrStart = make("TagAttrStart")
TagAttrEquals = make("TagAttrEquals") # =
TagAttrQuote = make("TagAttrQuote") # "
TagCloseOpen = make("TagCloseOpen") # >
TagCloseSelfclose = make("TagCloseSelfclose") # />
TagOpenClose = make("TagOpenClose") # </
TagCloseClose = make("TagCloseClose") # >

del make

Yükleniyor…
İptal
Kaydet