Ver código fonte

Pushing some smarter logic for MarkovChains

- Incomplete; need this for the TS rewrite
- Also starting work on docstrings for some methods
tags/v0.1^2
Ben Kurtovic 12 anos atrás
pai
commit
f382ceb38e
1 arquivo alterado com 23 adições e 1 exclusão
  1. +23
    -1
      earwigbot/wiki/copyright.py

+ 23
- 1
earwigbot/wiki/copyright.py Ver arquivo

@@ -24,6 +24,7 @@ from collections import defaultdict
from functools import partial from functools import partial
from gzip import GzipFile from gzip import GzipFile
from json import loads from json import loads
from re import sub, UNICODE
from StringIO import StringIO from StringIO import StringIO
from time import sleep, time from time import sleep, time
from urllib import quote_plus, urlencode from urllib import quote_plus, urlencode
@@ -62,7 +63,7 @@ class _MarkovChain(object):
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = text.split()
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
prev = self.START prev = self.START
for word in words: for word in words:
self.chain[prev][word] += 1 self.chain[prev][word] += 1
@@ -185,15 +186,36 @@ class CopyrightMixin(object):
return [result["url"] for result in results] return [result["url"] for result in results]


def _copyvio_strip_html(self, html): def _copyvio_strip_html(self, html):
"""
STUB
"""
return html return html


def _copyvio_strip_article(self, content): def _copyvio_strip_article(self, content):
"""Clean the page's raw text by removing templates and formatting.

Returns the page's text with all HTML and wikicode formatting removed,
including templates, tables, references, and the Bibliography/
References/Sources/See also section(s). It retains punctuation
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
quotes) and original capitalization, but not brackets (square and
angular), abnormal spacing, nor anything else. HTML entities are
replaced by their unicode equivalents.

STUB
"""
return content return content


def _copyvio_chunk_article(self, content, max_chunks): def _copyvio_chunk_article(self, content, max_chunks):
"""
STUB
"""
return [content] return [content]


def _copyvio_compare_content(self, article, url): def _copyvio_compare_content(self, article, url):
"""
DOCSTRING NEEDED
"""
html = self._open_url_ignoring_errors(url) html = self._open_url_ignoring_errors(url)
if not html: if not html:
return 0 return 0


Carregando…
Cancelar
Salvar