
copyvios: Refactor some parsing logic and add dynamic Blogger support

tags/v0.4
Ben Kurtovic 3 years ago
Parent
Commit
2324a73624
2 changed files with 114 additions and 39 deletions
  1. earwigbot/wiki/copyvios/parsers.py (+68, -19)
  2. earwigbot/wiki/copyvios/workers.py (+46, -20)
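
Before the diffs, a quick sketch of the new plumbing: the worker now passes an "open_url" callable into the parser's args (a functools.partial of _open_url_raw bound to the source's timeout), so a parser can fetch auxiliary URLs, such as Blogger's post feed, through the worker's opener. The names fetch_raw, open_helper, and parser_args below are illustrative stand-ins; only open_url, _OpenedURL, and the .content access mirror the diff.

    import collections
    import functools

    _OpenedURL = collections.namedtuple("_OpenedURL", ["content", "parser_class"])

    def fetch_raw(url, timeout=5):
        # Stand-in for _CopyvioWorker._open_url_raw: return the raw page
        # content wrapped in _OpenedURL, or None if the URL can't be read.
        return _OpenedURL("<html>...</html>", None)

    # Worker side: bind the source's timeout and hand the callable to the parser.
    parser_args = {"open_url": functools.partial(fetch_raw, timeout=30)}

    # Parser side, as in _HTMLParser._open: use the callable if present.
    def open_helper(args, url):
        opener = args.get("open_url")
        if not opener:
            return None
        result = opener(url)
        return result.content if result else None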

earwigbot/wiki/copyvios/parsers.py (+68, -19)

@@ -20,9 +20,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
from os import path
import re
from StringIO import StringIO
import urllib
import urlparse

import mwparserfromhell
@@ -246,40 +248,87 @@ class _HTMLParser(_BaseTextParser):
if soup.find_all(href=func) or soup.find_all(src=func):
raise ParserExclusionError()

@staticmethod
def _get_soup(text):
"""Parse some text using BeautifulSoup."""
try:
return bs4.BeautifulSoup(text, "lxml")
except ValueError:
return bs4.BeautifulSoup(text)

def _clean_soup(self, soup):
"""Clean a BeautifulSoup tree of invisible tags."""
is_comment = lambda text: isinstance(text, bs4.element.Comment)
for comment in soup.find_all(text=is_comment):
comment.extract()
for tag in self.hidden_tags:
for element in soup.find_all(tag):
element.extract()

return "\n".join(soup.stripped_strings)

def _open(self, url):
"""Try to read a URL. Return None if it couldn't be read."""
opener = self._args.get("open_url")
if not opener:
return None
result = opener(url)
return result.content if result else None

def _load_from_blogspot(self, url):
"""Load dynamic content from Blogger Dynamic Views."""
match = re.search(r"'postId': '(\d+)'", self.text)
if not match:
return ""
post_id = match.group(1)
url = "https://%s/feeds/posts/default/%s?" % (url.netloc, post_id)
params = {
"alt": "json",
"v": "2",
"dynamicviews": "1",
"rewriteforssl": "true",
}
raw = self._open(url + urllib.urlencode(params))
if raw is None:
return ""
try:
parsed = json.loads(raw)
except ValueError:
return ""
try:
text = parsed["entry"]["content"]["$t"]
except KeyError:
return ""
soup = self._get_soup(text)
return self._clean_soup(soup.body)

def parse(self):
"""Return the actual text contained within an HTML document.

Implemented using :py:mod:`BeautifulSoup <bs4>`
(http://www.crummy.com/software/BeautifulSoup/).
"""
try:
soup = bs4.BeautifulSoup(self.text, "lxml")
except ValueError:
soup = bs4.BeautifulSoup(self.text)

url = urlparse.urlparse(self.url) if self.url else None
soup = self._get_soup(self.text)
if not soup.body:
# No <body> tag present in HTML ->
# no scrapable content (possibly JS or <iframe> magic):
return ""

self._fail_if_mirror(soup)
soup = soup.body
body = soup.body

if self.url:
url = urlparse.urlparse(self.url)
if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
playback = soup.find(id="playback")
if playback and "src" in playback.attrs:
raise ParserRedirectError(playback.attrs["src"])
if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
playback = body.find(id="playback")
if playback and "src" in playback.attrs:
raise ParserRedirectError(playback.attrs["src"])

is_comment = lambda text: isinstance(text, bs4.element.Comment)
for comment in soup.find_all(text=is_comment):
comment.extract()
for tag in self.hidden_tags:
for element in soup.find_all(tag):
element.extract()
content = self._clean_soup(body)

return "\n".join(soup.stripped_strings)
if url and url.netloc.endswith(".blogspot.com") and not content:
content = self._load_from_blogspot(url)

return content


class _PDFParser(_BaseTextParser):
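
Blogger's "Dynamic Views" themes render posts client-side, so the scraped <body> comes back empty; _load_from_blogspot recovers the text by asking the blog's GData feed for the post whose id is embedded in the page's inline JavaScript. Below is a rough standalone sketch of the same request, using urllib2 directly rather than the worker's opener; the feed path and query parameters are taken from the diff, while the helper name and timeout are illustrative.

    import json
    import re
    import urllib
    import urllib2
    import urlparse

    def load_blogspot_text(page_html, page_url):
        # The Dynamic Views page embeds its numeric post id in inline JS.
        match = re.search(r"'postId': '(\d+)'", page_html)
        if not match:
            return ""
        netloc = urlparse.urlparse(page_url).netloc
        params = {"alt": "json", "v": "2", "dynamicviews": "1",
                  "rewriteforssl": "true"}
        feed = "https://%s/feeds/posts/default/%s?%s" % (
            netloc, match.group(1), urllib.urlencode(params))
        try:
            raw = urllib2.urlopen(feed, timeout=5).read()
            return json.loads(raw)["entry"]["content"]["$t"]
        except (urllib2.URLError, ValueError, KeyError):
            return ""  # unreadable feed, bad JSON, or unexpected structure

The returned value is the post's HTML; in the committed code it is then passed through _get_soup and _clean_soup before parse() returns it.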


earwigbot/wiki/copyvios/workers.py (+46, -20)

@@ -20,7 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import collections
from collections import deque
import functools
from gzip import GzipFile
from httplib import HTTPException
from logging import getLogger
@@ -30,7 +32,7 @@ from socket import error as socket_error
from StringIO import StringIO
from struct import error as struct_error
from threading import Lock, Thread
from time import time
import time
from urllib2 import build_opener, URLError

from earwigbot import importer
@@ -44,11 +46,14 @@ tldextract = importer.new("tldextract")
__all__ = ["globalize", "localize", "CopyvioWorkspace"]

_MAX_REDIRECTS = 3
_MAX_RAW_SIZE = 20 * 1024 ** 2

_is_globalized = False
_global_queues = None
_global_workers = []

_OpenedURL = collections.namedtuple('_OpenedURL', ['content', 'parser_class'])

def globalize(num_workers=8):
"""Cause all copyvio checks to be done by one global set of workers.

@@ -113,23 +118,15 @@ class _CopyvioWorker(object):
self._opener = build_opener()
self._logger = getLogger("earwigbot.wiki.cvworker." + name)

def _open_url(self, source, redirects=0):
"""Open a URL and return its parsed content, or None.

First, we will decompress the content if the headers contain "gzip" as
its content encoding. Then, we will return the content stripped using
an HTML parser if the headers indicate it is HTML, or return the
content directly if it is plain text. If we don't understand the
content type, we'll return None.
def _open_url_raw(self, url, timeout=5, allow_content_types=None):
"""Open a URL, without parsing it.

If a URLError was raised while opening the URL or an IOError was raised
while decompressing, None will be returned.
None will be returned for URLs that cannot be read for whatever reason.
"""
if source.headers:
self._opener.addheaders = source.headers
url = source.url.encode("utf8")
if isinstance(url, unicode):
url = url.encode("utf8")
try:
response = self._opener.open(url, timeout=source.timeout)
response = self._opener.open(url, timeout=timeout)
except (URLError, HTTPException, socket_error, ValueError):
return None

@@ -146,9 +143,13 @@ class _CopyvioWorker(object):
return None

try:
content = response.read()
# Additional safety check for pages using Transfer-Encoding: chunked
# where we can't read the Content-Length
content = response.read(_MAX_RAW_SIZE + 1)
except (URLError, socket_error):
return None
if len(content) > _MAX_RAW_SIZE:
return None

if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(content)
@@ -158,7 +159,32 @@ class _CopyvioWorker(object):
except (IOError, struct_error):
return None

parser = parser_class(content, url=url, args=source.parser_args)
if len(content) > _MAX_RAW_SIZE:
return None
return _OpenedURL(content, parser_class)

def _open_url(self, source, redirects=0):
"""Open a URL and return its parsed content, or None.

First, we will decompress the content if the headers contain "gzip" as
its content encoding. Then, we will return the content stripped using
an HTML parser if the headers indicate it is HTML, or return the
content directly if it is plain text. If we don't understand the
content type, we'll return None.

If a URLError was raised while opening the URL or an IOError was raised
while decompressing, None will be returned.
"""
if source.headers:
self._opener.addheaders = source.headers

result = self._open_url_raw(source.url, timeout=source.timeout)
if result is None:
return None

args = source.parser_args.copy()
args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout)
parser = result.parser_class(result.content, url=source.url, args=args)
try:
return parser.parse()
except ParserRedirectError as exc:
@@ -170,7 +196,7 @@ class _CopyvioWorker(object):
def _acquire_new_site(self):
"""Block for a new unassigned site queue."""
if self._until:
timeout = self._until - time()
timeout = self._until - time.time()
if timeout <= 0:
raise Empty
else:
@@ -269,7 +295,7 @@ class CopyvioWorkspace(object):
self._article = article
self._logger = logger.getChild("copyvios")
self._min_confidence = min_confidence
self._start_time = time()
self._start_time = time.time()
self._until = (self._start_time + max_time) if max_time > 0 else None
self._handled_urls = set()
self._finish_lock = Lock()
@@ -407,5 +433,5 @@ class CopyvioWorkspace(object):

self.sources.sort(cmpfunc)
return CopyvioCheckResult(self.finished, self.sources, num_queries,
time() - self._start_time, self._article,
time.time() - self._start_time, self._article,
self.possible_miss)
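
One more note on _open_url_raw: reading response.read(_MAX_RAW_SIZE + 1) caps the download even for responses sent with Transfer-Encoding: chunked, where there is no Content-Length to check up front, and the length test is repeated after gzip decompression so a small compressed body can't expand past the limit. A self-contained sketch of that guard, assuming a file-like response object; read_limited and the gzipped flag are illustrative, while the 20 MB limit matches the diff.

    from gzip import GzipFile
    from StringIO import StringIO

    _MAX_RAW_SIZE = 20 * 1024 ** 2

    def read_limited(response, gzipped=False):
        # Read at most one byte past the limit; if that extra byte arrives,
        # the body is too large and we discard it instead of buffering it all.
        content = response.read(_MAX_RAW_SIZE + 1)
        if len(content) > _MAX_RAW_SIZE:
            return None
        if gzipped:
            try:
                content = GzipFile(fileobj=StringIO(content)).read(_MAX_RAW_SIZE + 1)
            except IOError:
                return None
            if len(content) > _MAX_RAW_SIZE:
                return None
        return content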
