Procházet zdrojové kódy

copyvios: use a different timeout for direct URL comparisons.

tags/v0.2
Ben Kurtovic před 10 roky
rodič
revize
5e9d4cfa78
1 změnil soubory, kde provedl 6 přidání a 6 odebrání
  1. +6
    -6
      earwigbot/wiki/copyvios/__init__.py

+ 6
- 6
earwigbot/wiki/copyvios/__init__.py Zobrazit soubor

@@ -55,7 +55,7 @@ class CopyvioMixIn(object):
self._opener = build_opener() self._opener = build_opener()
self._opener.addheaders = site._opener.addheaders self._opener.addheaders = site._opener.addheaders


def _open_url_ignoring_errors(self, url):
def _open_url_ignoring_errors(self, url, max_time=5):
"""Open a URL and return its parsed content, or None. """Open a URL and return its parsed content, or None.


First, we will decompress the content if the headers contain "gzip" as First, we will decompress the content if the headers contain "gzip" as
@@ -68,7 +68,7 @@ class CopyvioMixIn(object):
while decompressing, None will be returned. while decompressing, None will be returned.
""" """
try: try:
response = self._opener.open(url.encode("utf8"), timeout=5)
response = self._opener.open(url.encode("utf8"), timeout=max_time)
result = response.read() result = response.read()
except (URLError, timeout): except (URLError, timeout):
return None return None
@@ -115,13 +115,13 @@ class CopyvioMixIn(object):


raise exceptions.UnknownSearchEngineError(engine) raise exceptions.UnknownSearchEngineError(engine)


def _copyvio_compare_content(self, article, url):
def _copyvio_compare_content(self, article, url, max_time=5):
"""Return a number comparing an article and a URL. """Return a number comparing an article and a URL.


The *article* is a Markov chain, whereas the *url* is just a string The *article* is a Markov chain, whereas the *url* is just a string
that we'll try to open and read ourselves. that we'll try to open and read ourselves.
""" """
text = self._open_url_ignoring_errors(url)
text = self._open_url_ignoring_errors(url, max_time)
if not text: if not text:
return 0, (self.EMPTY, self.EMPTY_INTERSECTION) return 0, (self.EMPTY, self.EMPTY_INTERSECTION)


@@ -216,7 +216,7 @@ class CopyvioMixIn(object):
num_queries, ctime, article_chain, num_queries, ctime, article_chain,
best_chains) best_chains)


def copyvio_compare(self, url, min_confidence=0.5):
def copyvio_compare(self, url, min_confidence=0.5, max_time=15):
"""Check the page like :py:meth:`copyvio_check` against a specific URL. """Check the page like :py:meth:`copyvio_check` against a specific URL.


This is essentially a reduced version of the above - a copyvio This is essentially a reduced version of the above - a copyvio
@@ -247,7 +247,7 @@ class CopyvioMixIn(object):
chns = (self.EMPTY, self.EMPTY_INTERSECTION) chns = (self.EMPTY, self.EMPTY_INTERSECTION)
return CopyvioCheckResult(False, 0, url, 0, 0, article_chain, chns) return CopyvioCheckResult(False, 0, url, 0, 0, article_chain, chns)


confidence, chains = self._copyvio_compare_content(article_chain, url)
confidence, chains = self._copyvio_compare_content(article_chain, url, max_time)
ctime = time() - start_time ctime = time() - start_time
if confidence >= min_confidence: if confidence >= min_confidence:
is_violation = True is_violation = True


Načítá se…
Zrušit
Uložit