瀏覽代碼

Minor refactor in HTML parser.

tags/v0.3
Ben Kurtovic 8 年之前
父節點
當前提交
f2099df5d5
共有 1 個文件被更改,包括 15 次插入6 次删除
  1. +15
    -6
      earwigbot/wiki/copyvios/parsers.py

+ 15
- 6
earwigbot/wiki/copyvios/parsers.py 查看文件

@@ -232,6 +232,20 @@ class _HTMLParser(_BaseTextParser):
"script", "style"
]

def _fail_if_mirror(self, soup):
    """Look for obvious signs that the given soup is a wiki mirror.

    If so, raise ParserExclusionError, which is caught in the workers and
    causes this source to be excluded.

    The check is skipped entirely when no "mirror_hints" entry is present
    in self._args. Otherwise, the soup is searched for any tag whose
    ``href`` or ``src`` attribute value contains one of the hints.
    """
    if "mirror_hints" not in self._args:
        return

    # Look the hints up once instead of on every attribute test.
    hints = self._args["mirror_hints"]

    def has_hint(attr):
        # Guard against a missing/empty attribute value before doing the
        # substring tests (the same guard the original predicate used).
        return bool(attr) and any(hint in attr for hint in hints)

    # Only existence matters, so find() (which stops at the first match)
    # is enough; there is no need to collect every match with find_all().
    if (soup.find(href=has_hint) is not None or
            soup.find(src=has_hint) is not None):
        raise ParserExclusionError()

def parse(self):
"""Return the actual text contained within an HTML document.

@@ -248,12 +262,7 @@ class _HTMLParser(_BaseTextParser):
# no scrapable content (possibly JS or <frame> magic):
return ""

if "mirror_hints" in self._args:
# Look for obvious signs that this is a mirror:
func = lambda attr: attr and any(
hint in attr for hint in self._args["mirror_hints"])
if soup.find_all(href=func) or soup.find_all(src=func):
raise ParserExclusionError()
self._fail_if_mirror(soup)

soup = soup.body
is_comment = lambda text: isinstance(text, bs4.element.Comment)


Loading…
取消
儲存