Browse Source

Pushing some smarter logic for MarkovChains

- Incomplete; need this for the TS rewrite
- Also starting work on docstrings for some methods
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
f382ceb38e
1 changed file with 23 additions and 1 deletion
  1. +23
    -1
      earwigbot/wiki/copyright.py

+ 23
- 1
earwigbot/wiki/copyright.py View File

@@ -24,6 +24,7 @@ from collections import defaultdict
from functools import partial from functools import partial
from gzip import GzipFile from gzip import GzipFile
from json import loads from json import loads
from re import sub, UNICODE
from StringIO import StringIO from StringIO import StringIO
from time import sleep, time from time import sleep, time
from urllib import quote_plus, urlencode from urllib import quote_plus, urlencode
@@ -62,7 +63,7 @@ class _MarkovChain(object):
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = text.split()
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
prev = self.START prev = self.START
for word in words: for word in words:
self.chain[prev][word] += 1 self.chain[prev][word] += 1
@@ -185,15 +186,36 @@ class CopyrightMixin(object):
return [result["url"] for result in results] return [result["url"] for result in results]


def _copyvio_strip_html(self, html): def _copyvio_strip_html(self, html):
"""
STUB
"""
return html return html


def _copyvio_strip_article(self, content): def _copyvio_strip_article(self, content):
"""Clean the page's raw text by removing templates and formatting.

Returns the page's text with all HTML and wikicode formatting removed,
including templates, tables, references, and the Bibliography/
References/Sources/See also section(s). It retains punctuation
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
quotes) and original capitalization, but not brackets (square and
angular), abnormal spacing, nor anything else. HTML entities are
replaced by their unicode equivalents.

STUB
"""
return content return content


def _copyvio_chunk_article(self, content, max_chunks): def _copyvio_chunk_article(self, content, max_chunks):
"""
STUB
"""
return [content] return [content]


def _copyvio_compare_content(self, article, url): def _copyvio_compare_content(self, article, url):
"""
DOCSTRING NEEDED
"""
html = self._open_url_ignoring_errors(url) html = self._open_url_ignoring_errors(url)
if not html: if not html:
return 0 return 0


Loading…
Cancel
Save