Ver código fonte

Pushing some smarter logic for MarkovChains

- Incomplete; need this for the TS rewrite
- Also starting work on docstrings for some methods
tags/v0.1^2
Ben Kurtovic 12 anos atrás
pai
commit
f382ceb38e
1 arquivos alterados com 23 adições e 1 exclusões
  1. +23
    -1
      earwigbot/wiki/copyright.py

+ 23
- 1
earwigbot/wiki/copyright.py Ver arquivo

@@ -24,6 +24,7 @@ from collections import defaultdict
from functools import partial
from gzip import GzipFile
from json import loads
from re import sub, UNICODE
from StringIO import StringIO
from time import sleep, time
from urllib import quote_plus, urlencode
@@ -62,7 +63,7 @@ class _MarkovChain(object):
def __init__(self, text):
self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = text.split()
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
prev = self.START
for word in words:
self.chain[prev][word] += 1
@@ -185,15 +186,36 @@ class CopyrightMixin(object):
return [result["url"] for result in results]

def _copyvio_strip_html(self, html):
"""
STUB
"""
return html

def _copyvio_strip_article(self, content):
"""Clean the page's raw text by removing templates and formatting.

Returns the page's text with all HTML and wikicode formatting removed,
including templates, tables, references, and the Bibliography/
References/Sources/See also section(s). It retains punctuation
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
quotes) and original capitalization, but not brackets (square and
angular), abnormal spacing, nor anything else. HTML entities are
replaced by their unicode equivalents.

STUB
"""
return content

def _copyvio_chunk_article(self, content, max_chunks):
"""
STUB
"""
return [content]

def _copyvio_compare_content(self, article, url):
"""
DOCSTRING NEEDED
"""
html = self._open_url_ignoring_errors(url)
if not html:
return 0


Carregando…
Cancelar
Salvar