Browse Source

Merge branch 'feature/cv2015' into develop

tags/v0.2
Ben Kurtovic 8 years ago
parent
commit
d8202e6094
8 changed files with 91 additions and 31 deletions
  1. +2
    -1
      CHANGELOG
  2. +12
    -9
      earwigbot/exceptions.py
  3. +2
    -2
      earwigbot/wiki/copyvios/__init__.py
  4. +3
    -1
      earwigbot/wiki/copyvios/exclusions.py
  5. +1
    -1
      earwigbot/wiki/copyvios/markov.py
  6. +35
    -6
      earwigbot/wiki/copyvios/parsers.py
  7. +12
    -3
      earwigbot/wiki/copyvios/result.py
  8. +24
    -8
      earwigbot/wiki/copyvios/workers.py

+ 2
- 1
CHANGELOG View File

@@ -15,7 +15,8 @@ v0.2 (unreleased):
- Added copyvio detector functionality: specifying a max time for checks;
improved exclusion support. URL loading and parsing is parallelized to speed
up check times, with a multi-threaded worker model that avoids concurrent
requests to the same domain. Fixed assorted bugs.
requests to the same domain. Improvements to the comparison algorithm. Fixed
assorted bugs.
- Added support for Wikimedia Labs when creating a config file.
- Added and improved lazy importing for various dependencies.
- Fixed a bug in job scheduling.


+ 12
- 9
earwigbot/exceptions.py View File

@@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot::
+-- UnknownSearchEngineError
+-- UnsupportedSearchEngineError
+-- SearchQueryError
+-- ParserExclusionError
"""

class EarwigBotError(Exception):
@@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError):
:py:attr:`config.wiki["search"]["engine"]`.

Raised by :py:meth:`Page.copyvio_check
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
:py:meth:`Page.copyvio_compare
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
"""

class UnsupportedSearchEngineError(CopyvioCheckError):
@@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError):
couldn't be imported.

Raised by :py:meth:`Page.copyvio_check
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
:py:meth:`Page.copyvio_compare
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
"""

class SearchQueryError(CopyvioCheckError):
"""Some error occurred while doing a search query.

Raised by :py:meth:`Page.copyvio_check
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
:py:meth:`Page.copyvio_compare
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
"""

class ParserExclusionError(CopyvioCheckError):
    """Indicates a content parser decided the source page must be excluded.

    Used as an internal signal by :py:meth:`Page.copyvio_check
    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; it is caught
    before results are returned and is never propagated to client code.
    """

+ 2
- 2
earwigbot/wiki/copyvios/__init__.py View File

@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit)
short_circuit=short_circuit, detect_exclusions=True)
if self._exclusions_db:
self._exclusions_db.sync(self.site.name)
exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
article = MarkovChain(ArticleTextParser(self.get()).strip())
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
max_time, 1)
max_time, num_workers=1)
workspace.enqueue([url])
workspace.wait()
result = workspace.get_result()


+ 3
- 1
earwigbot/wiki/copyvios/exclusions.py View File

@@ -28,7 +28,7 @@ from urlparse import urlparse

from earwigbot import exceptions

__all__ = ["ExclusionsDB"]
__all__ = ["ExclusionsDB", "MIRROR_HINTS"]

DEFAULT_SOURCES = {
"all": [ # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
]
}

MIRROR_HINTS = ["wikipedia.org/w/"]

class ExclusionsDB(object):
"""
**EarwigBot: Wiki Toolset: Exclusions Database Manager**


+ 1
- 1
earwigbot/wiki/copyvios/markov.py View File

@@ -30,7 +30,7 @@ class MarkovChain(object):
"""Implements a basic ngram Markov chain of words."""
START = -1
END = -2
degree = 3 # 2 for bigrams, 3 for trigrams, etc.
degree = 5 # 2 for bigrams, 3 for trigrams, etc.

def __init__(self, text):
self.text = text


+ 35
- 6
earwigbot/wiki/copyvios/parsers.py View File

@@ -27,6 +27,8 @@ from StringIO import StringIO
import mwparserfromhell

from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError
from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS

bs4 = importer.new("bs4")
nltk = importer.new("nltk")
@@ -58,6 +60,21 @@ class _BaseTextParser(object):
class ArticleTextParser(_BaseTextParser):
"""A parser that can strip and chunk wikicode article text."""
TYPE = "Article"
TEMPLATE_MERGE_THRESHOLD = 35

def _merge_templates(self, code):
    """Merge long template parameter values into the wikicode, in place.

    For each template in *code*, parameter values whose length (as
    measured by ``len()`` on the value's wikicode -- node count vs.
    character count depends on mwparserfromhell; confirm) meets
    TEMPLATE_MERGE_THRESHOLD are kept and substituted for the template;
    templates with no such parameter are removed entirely.
    """
    # NOTE(review): RECURSE_OTHERS presumably also visits templates nested
    # inside other node types (tags, links) -- confirm against the
    # mwparserfromhell documentation.
    for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
        chunks = []
        for param in template.params:
            # Short values are typically formatting or metadata rather
            # than prose, so only long values are worth comparing.
            if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD:
                # Recurse first so templates nested inside a kept value
                # are themselves merged or removed.
                self._merge_templates(param.value)
                chunks.append(param.value)
        if chunks:
            # Surround with spaces so adjacent words don't fuse together
            # once the template markup is gone.
            subst = u" ".join(map(unicode, chunks))
            code.replace(template, u" " + subst + u" ")
        else:
            code.remove(template)

def strip(self):
"""Clean the page's raw text by removing templates and formatting.
@@ -94,6 +111,9 @@ class ArticleTextParser(_BaseTextParser):
for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"):
remove(wikicode, tag)

# Merge in template contents when the values are long:
self._merge_templates(wikicode)

clean = wikicode.strip_code(normalize=True, collapse=True)
self.clean = re.sub("\n\n+", "\n", clean).strip()
return self.clean
@@ -167,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
"script", "style"
]

def parse(self):
def parse(self, **kwargs):
"""Return the actual text contained within an HTML document.

Implemented using :py:mod:`BeautifulSoup <bs4>`
(http://www.crummy.com/software/BeautifulSoup/).
"""
try:
soup = bs4.BeautifulSoup(self.text, "lxml").body
soup = bs4.BeautifulSoup(self.text, "lxml")
except ValueError:
soup = bs4.BeautifulSoup(self.text).body
soup = bs4.BeautifulSoup(self.text)

if not soup:
if not soup.body:
# No <body> tag present in HTML ->
# no scrapable content (possibly JS or <frame> magic):
return ""

if kwargs["detect_exclusions"]:
# Look for obvious signs that this is a mirror:
func = lambda attr: attr and any(
hint in attr for hint in MIRROR_HINTS)
if soup.find_all(href=func) or soup.find_all(src=func):
raise ParserExclusionError()

soup = soup.body
is_comment = lambda text: isinstance(text, bs4.element.Comment)
for comment in soup.find_all(text=is_comment):
comment.extract()
@@ -200,7 +229,7 @@ class _PDFParser(_BaseTextParser):
(u"\u2022", u" "),
]

def parse(self):
def parse(self, **kwargs):
"""Return extracted text from the PDF."""
output = StringIO()
manager = pdfinterp.PDFResourceManager()
@@ -226,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
"""A parser that can unicode-ify and strip text from a plain text page."""
TYPE = "Text"

def parse(self):
def parse(self, **kwargs):
"""Unicode-ify and strip whitespace from the plain text document."""
converted = bs4.UnicodeDammit(self.text).unicode_markup
return converted.strip() if converted else ""


+ 12
- 3
earwigbot/wiki/copyvios/result.py View File

@@ -40,16 +40,21 @@ class CopyvioSource(object):
- :py:attr:`confidence`: the confidence of a violation, between 0 and 1
- :py:attr:`chains`: a 2-tuple of the source chain and the delta chain
- :py:attr:`skipped`: whether this URL was skipped during the check
- :py:attr:`excluded`: whether this URL was in the exclusions list
"""

def __init__(self, workspace, url, headers=None, timeout=5):
def __init__(self, workspace, url, headers=None, timeout=5,
detect_exclusions=False):
self.workspace = workspace
self.url = url
self.headers = headers
self.timeout = timeout
self.detect_exclusions = detect_exclusions

self.confidence = 0.0
self.chains = (EMPTY, EMPTY_INTERSECTION)
self.skipped = False
self.excluded = False

self._event1 = Event()
self._event2 = Event()
@@ -57,11 +62,15 @@ class CopyvioSource(object):

def __repr__(self):
"""Return the canonical string representation of the source."""
res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
return res.format(self.url, self.confidence, self.skipped)
res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, "
"excluded={3!r})")
return res.format(
self.url, self.confidence, self.skipped, self.excluded)

def __str__(self):
"""Return a nice string representation of the source."""
if self.excluded:
return "<CopyvioSource ({0}, excluded)>".format(self.url)
if self.skipped:
return "<CopyvioSource ({0}, skipped)>".format(self.url)
res = "<CopyvioSource ({0} with {1} conf)>"


+ 24
- 8
earwigbot/wiki/copyvios/workers.py View File

@@ -34,6 +34,7 @@ from time import time
from urllib2 import build_opener, URLError

from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
@@ -155,7 +156,8 @@ class _CopyvioWorker(object):
except (IOError, struct_error):
return None

return handler(content).parse()
return handler(content).parse(
detect_exclusions=source.detect_exclusions)

def _acquire_new_site(self):
"""Block for a new unassigned site queue."""
@@ -218,9 +220,15 @@ class _CopyvioWorker(object):
except StopIteration:
self._logger.debug("Exiting: got stop signal")
return
text = self._open_url(source)
chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain)

try:
text = self._open_url(source)
except ParserExclusionError:
source.skipped = source.excluded = True
source.finish_work()
else:
chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain)

def start(self):
"""Start the copyvio worker in a new thread."""
@@ -233,7 +241,8 @@ class CopyvioWorkspace(object):
"""Manages a single copyvio check distributed across threads."""

def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8, short_circuit=True):
url_timeout=5, num_workers=8, short_circuit=True,
detect_exclusions=False):
self.sources = []
self.finished = False
self.possible_miss = False
@@ -247,7 +256,8 @@ class CopyvioWorkspace(object):
self._finish_lock = Lock()
self._short_circuit = short_circuit
self._source_args = {"workspace": self, "headers": headers,
"timeout": url_timeout}
"timeout": url_timeout,
"detect_exclusions": detect_exclusions}

if _is_globalized:
self._queues = _global_queues
@@ -311,11 +321,15 @@ class CopyvioWorkspace(object):
if url in self._handled_urls:
continue
self._handled_urls.add(url)
if exclude_check and exclude_check(url):
continue

source = CopyvioSource(url=url, **self._source_args)
self.sources.append(source)

if exclude_check and exclude_check(url):
self._logger.debug(u"enqueue(): exclude {0}".format(url))
source.excluded = True
source.skip()
continue
if self._short_circuit and self.finished:
self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
source.skip()
@@ -371,6 +385,8 @@ class CopyvioWorkspace(object):
def cmpfunc(s1, s2):
    # Sort order: highest confidence first; on ties, excluded sources
    # sink below non-excluded ones, and skipped below non-skipped.
    if s1.confidence > s2.confidence:
        return -1
    if s1.confidence < s2.confidence:
        return 1
    if s1.excluded != s2.excluded:
        return 1 if s1.excluded else -1
    return int(s1.skipped) - int(s2.skipped)

self.sources.sort(cmpfunc)


Loading…
Cancel
Save