Browse Source

copyvios: use a different timeout for direct URL comparisons.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
5e9d4cfa78
1 changed file with 6 additions and 6 deletions
  1. +6
    -6
      earwigbot/wiki/copyvios/__init__.py

+ 6
- 6
earwigbot/wiki/copyvios/__init__.py View File

@@ -55,7 +55,7 @@ class CopyvioMixIn(object):
self._opener = build_opener() self._opener = build_opener()
self._opener.addheaders = site._opener.addheaders self._opener.addheaders = site._opener.addheaders


def _open_url_ignoring_errors(self, url):
def _open_url_ignoring_errors(self, url, max_time=5):
"""Open a URL and return its parsed content, or None. """Open a URL and return its parsed content, or None.


First, we will decompress the content if the headers contain "gzip" as First, we will decompress the content if the headers contain "gzip" as
@@ -68,7 +68,7 @@ class CopyvioMixIn(object):
while decompressing, None will be returned. while decompressing, None will be returned.
""" """
try: try:
response = self._opener.open(url.encode("utf8"), timeout=5)
response = self._opener.open(url.encode("utf8"), timeout=max_time)
result = response.read() result = response.read()
except (URLError, timeout): except (URLError, timeout):
return None return None
@@ -115,13 +115,13 @@ class CopyvioMixIn(object):


raise exceptions.UnknownSearchEngineError(engine) raise exceptions.UnknownSearchEngineError(engine)


def _copyvio_compare_content(self, article, url):
def _copyvio_compare_content(self, article, url, max_time=5):
"""Return a number comparing an article and a URL. """Return a number comparing an article and a URL.


The *article* is a Markov chain, whereas the *url* is just a string The *article* is a Markov chain, whereas the *url* is just a string
that we'll try to open and read ourselves. that we'll try to open and read ourselves.
""" """
text = self._open_url_ignoring_errors(url)
text = self._open_url_ignoring_errors(url, max_time)
if not text: if not text:
return 0, (self.EMPTY, self.EMPTY_INTERSECTION) return 0, (self.EMPTY, self.EMPTY_INTERSECTION)


@@ -216,7 +216,7 @@ class CopyvioMixIn(object):
num_queries, ctime, article_chain, num_queries, ctime, article_chain,
best_chains) best_chains)


def copyvio_compare(self, url, min_confidence=0.5):
def copyvio_compare(self, url, min_confidence=0.5, max_time=15):
"""Check the page like :py:meth:`copyvio_check` against a specific URL. """Check the page like :py:meth:`copyvio_check` against a specific URL.


This is essentially a reduced version of the above - a copyvio This is essentially a reduced version of the above - a copyvio
@@ -247,7 +247,7 @@ class CopyvioMixIn(object):
chns = (self.EMPTY, self.EMPTY_INTERSECTION) chns = (self.EMPTY, self.EMPTY_INTERSECTION)
return CopyvioCheckResult(False, 0, url, 0, 0, article_chain, chns) return CopyvioCheckResult(False, 0, url, 0, 0, article_chain, chns)


confidence, chains = self._copyvio_compare_content(article_chain, url)
confidence, chains = self._copyvio_compare_content(article_chain, url, max_time)
ctime = time() - start_time ctime = time() - start_time
if confidence >= min_confidence: if confidence >= min_confidence:
is_violation = True is_violation = True


Loading…
Cancel
Save