Преглед изворни кода

Merge branch 'feature/wptagger' into develop

tags/v0.1^2
Ben Kurtovic пре 12 година
родитељ
комит
2a49016f0b
2 измењених фајлова са 300 додато и 5 уклоњено
  1. +0
    -1
      docs/api/earwigbot.tasks.rst
  2. +300
    -4
      earwigbot/tasks/wikiproject_tagger.py

+ 0
- 1
docs/api/earwigbot.tasks.rst Прегледај датотеку

@@ -13,5 +13,4 @@ tasks Package

.. automodule:: earwigbot.tasks.wikiproject_tagger
:members:
:undoc-members:
:show-inheritance:

+ 300
- 4
earwigbot/tasks/wikiproject_tagger.py Прегледај датотеку

@@ -20,14 +20,310 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re

from earwigbot import exceptions
from earwigbot.tasks import Task
from earwigbot.wiki import constants

class WikiProjectTagger(Task):
    """A task to tag talk pages with WikiProject banners.

    Usage: :command:`earwigbot -t wikiproject_tagger PATH
    --banner BANNER (--category CAT | --file FILE) [--summary SUM]
    [--append TEXT] [--autoassess] [--nocreate] [--recursive NUM]
    [--site SITE]`

    .. glossary::

    ``--banner BANNER``
        the page name of the banner to add, without a namespace (unless the
        namespace is something other than ``Template``) so
        ``--banner WikiProject Biography`` for ``{{WikiProject Biography}}``
    ``--category CAT`` or ``--file FILE``
        determines which pages to tag; either all pages in a category (to
        include subcategories as well, see ``--recursive``) or all
        pages/categories in a file (utf-8 encoded and path relative to the
        current directory)
    ``--summary SUM``
        an optional edit summary to use; defaults to
        ``"Adding WikiProject banner {{BANNER}}."``
    ``--append TEXT``
        optional text to append to the banner (after an autoassessment, if
        any), like ``|importance=low``
    ``--autoassess``
        try to assess each article's class automatically based on the class
        of other banners on the same page
    ``--nocreate``
        don't create new talk pages with just a banner if the page doesn't
        already exist
    ``--recursive NUM``
        recursively go through subcategories up to a maximum depth of
        ``NUM``, or if ``NUM`` isn't provided, go infinitely (this can be
        dangerous)
    ``--site SITE``
        the ID of the site to tag pages on, defaulting to the default site
    """
    # Task identifier; this is the value given to ``-t`` in the usage above.
    name = "wikiproject_tagger"

def setup(self):
pass
# Regexes for template names that should always go above the banner, based
# on [[Wikipedia:Talk page layout]].
#
# add_banner() tries each pattern with re.match() against a template name
# that has been lower-cased and had its underscores replaced by spaces, so
# every pattern is written in lower case, uses " ?" for optional spaces and
# ends in "$" to reject longer names. Blank lines separate the groups of
# related templates.
TOP_TEMPS = [
    r"skip ?to ?(toc|talk|toctalk)$",

    r"ga ?nominee$",

    r"(user ?)?talk ?(header|page|page ?header)$",

    r"community ?article ?probation$",
    r"censor(-nudity)?$",
    r"blp(o| ?others?)?$",
    r"controvers(ial2?|y)$",

    r"(not ?(a ?)?)?forum$",
    r"tv(episode|series)talk$",
    r"recurring ?themes$",
    r"faq$",
    r"(round ?in ?)?circ(les|ular)$",

    r"ar(ti|it)cle ?(history|milestones)$",
    r"failed ?ga$",
    r"old ?prod( ?full)?$",
    r"(old|previous) ?afd$",

    r"((wikiproject|wp) ?)?bio(graph(y|ies))?$",
]

def _upperfirst(self, text):
"""Try to uppercase the first letter of a string."""
try:
return text[0].upper() + text[1:]
except IndexError:
return text

def run(self, **kwargs):
    """Main entry point for the bot task.

    Checks the keyword arguments, resolves the banner and its aliases via
    :meth:`get_names`, bundles the options into a ``_Job`` and passes it to
    :meth:`run_job`.

    Keyword arguments read here: ``banner`` (required), ``category`` and/or
    ``file`` (at least one is required), ``summary``, ``append``,
    ``autoassess``, ``nocreate``, ``recursive`` and ``site``.

    Always returns ``None``. Missing arguments are reported through
    ``self.logger.error``; a ``_ShutoffEnabled`` raised while the job is
    running stops the task quietly.
    """
    if "file" not in kwargs and "category" not in kwargs:
        log = "No pages to tag; I need either a 'category' or a 'file' passed as kwargs"
        self.logger.error(log)
        return
    if "banner" not in kwargs:
        log = "Needs a banner to add passed as the 'banner' kwarg"
        self.logger.error(log)
        return

    site = self.bot.wiki.get_site(name=kwargs.get("site"))
    banner = kwargs["banner"]
    # "$3" is replaced with the banner text when the edit summary is built.
    summary = kwargs.get("summary", "Adding WikiProject banner $3.")
    # The append text is concatenated straight into the banner, so a missing
    # value must be an empty string rather than None.
    append = kwargs.get("append") or ""
    autoassess = kwargs.get("autoassess", False)
    nocreate = kwargs.get("nocreate", False)
    recursive = kwargs.get("recursive", 0)
    banner, names = self.get_names(site, banner)
    if not names:
        # get_names() has already logged why the banner is unusable.
        return
    job = _Job(banner, names, summary, append, autoassess, nocreate)

    try:
        self.run_job(kwargs, site, job, recursive)
    except _ShutoffEnabled:
        return

def run_job(self, kwargs, site, job, recursive):
    """Run a tagging *job* on a given *site*.

    *kwargs* is the task's keyword-argument dict. If it has a ``category``
    key, that category is processed; if it has a ``file`` key, the file is
    read as UTF-8 with one page or category title per line. Blank lines are
    skipped and an enclosing ``[[...]]`` is removed from a title. Titles in
    the category namespace go to :meth:`process_category`, everything else
    to :meth:`process_page`. *recursive* is passed through unchanged to
    :meth:`process_category`.
    """
    # Local import keeps this block self-contained; io.open decodes while
    # reading and behaves the same on Python 2 and 3.
    import io

    if "category" in kwargs:
        title = kwargs["category"]
        title = self.guess_namespace(site, title, constants.NS_CATEGORY)
        self.process_category(site.get_page(title), job, recursive)

    if "file" in kwargs:
        with io.open(kwargs["file"], encoding="utf8") as fileobj:
            for line in fileobj:
                # Strip first: every line read from the file ends with a
                # newline, which would otherwise defeat the "]]" check below
                # and leak into the page title.
                title = line.strip()
                if not title:
                    continue
                if title.startswith("[[") and title.endswith("]]"):
                    title = title[2:-2].strip()
                page = site.get_page(title)
                if page.namespace == constants.NS_CATEGORY:
                    self.process_category(page, job, recursive)
                else:
                    self.process_page(page, job)

def guess_namespace(self, site, title, assumed):
    """Return *title* with a namespace prefix, guessing one if needed.

    When *title* has no ``:`` at all, or the text before its first ``:`` is
    not a namespace name known to *site*, the name of the *assumed*
    namespace ID is prepended. Otherwise *title* is returned untouched.

    This mirrors template transclusion, where the namespace is taken to be
    ``NS_TEMPLATE`` unless one is given explicitly (so ``{{foo}}`` ->
    ``[[Template:Foo]]``, but ``{{:foo}}`` -> ``[[Foo]]``).
    """
    head, colon, _ = title.partition(":")
    if colon:
        try:
            site.namespace_name_to_id(head)
        except exceptions.NamespaceNotFoundError:
            # The text before the colon is not a namespace; fall through
            # and treat the whole title as unprefixed.
            pass
        else:
            return title
    assumed_name = site.namespace_id_to_name(assumed)
    return u":".join((assumed_name, title))

def get_names(self, site, banner):
    """Return all possible aliases for a given *banner* template.

    Returns a ``(banner, names)`` tuple. *banner* is the text to put between
    ``{{`` and ``}}``: an explicit ``Template:`` prefix is dropped, while any
    other explicit namespace is kept, because transcluding the bare name
    would refer to a different page. *names* is a list of the names under
    which the banner may already appear on a page: the banner itself, its
    full title, and every redirect to it (with and without the namespace
    prefix for redirects in the template namespace).

    If the banner page does not exist, an error is logged and *names* is
    ``None``.
    """
    title = self.guess_namespace(site, banner, constants.NS_TEMPLATE)
    page = site.get_page(title)
    if title == banner and page.namespace == constants.NS_TEMPLATE:
        # An explicit "Template:" prefix is redundant in a transclusion.
        banner = banner.split(":", 1)[1]
    if page.exists != page.PAGE_EXISTS:
        self.logger.error(u"Banner [[{0}]] does not exist".format(title))
        return banner, None

    names = [self._upperfirst(banner)]
    if banner != title:
        names.append(self._upperfirst(title))
    result = site.api_query(action="query", list="backlinks", bllimit=500,
                            blfilterredir="redirects", bltitle=title)
    for backlink in result["query"]["backlinks"]:
        names.append(backlink["title"])
        if backlink["ns"] == constants.NS_TEMPLATE:
            # Template redirects are normally transcluded without a prefix.
            names.append(backlink["title"].split(":", 1)[1])

    log = u"Found {0} aliases for banner [[{1}]]".format(len(names), title)
    self.logger.debug(log)
    return banner, names

def process_category(self, page, job, recursive):
    """Try to tag all pages in the given category.

    *page* is the category page. Every member that is not itself a category
    is passed to :meth:`process_page` together with *job*. Subcategories are
    followed according to *recursive*: ``True`` means follow them without a
    depth limit, a non-zero number means follow them with the number reduced
    by one at each level, and a falsy value means skip them.
    """
    self.logger.info(u"Processing category: [[{0}]]".format(page.title))
    for member in page.get_members():
        if member.namespace == constants.NS_CATEGORY:
            if recursive is True:
                self.process_category(member, job, True)
            elif recursive:
                self.process_category(member, job, recursive - 1)
        else:
            self.process_page(member, job)

def process_page(self, page, job):
    """Try to tag a specific *page* using the *job* description.

    The steps are:

    1. On every tenth page (counted in ``job.counter``), check whether
       shutoff is enabled and raise ``_ShutoffEnabled`` if so.
    2. Switch to the talk page if *page* is not one already.
    3. If the talk page does not exist, either skip it (``job.nocreate``) or
       create it containing only the banner.
    4. If any template on the page has a name listed in ``job.names``, skip
       the page as already tagged.
    5. Otherwise build the banner, put it inside the banner shell if there
       is one or else at the appropriate top-level position, apply general
       fixes and save the page.
    """
    if job.counter % 10 == 0:  # Do a shutoff check every ten pages
        if self.shutoff_enabled(page.site):
            raise _ShutoffEnabled()
    job.counter += 1

    if not page.is_talkpage:
        page = page.toggle_talk()
    try:
        code = page.parse()
    except exceptions.PageNotFoundError:
        if job.nocreate:
            log = u"Skipping nonexistent page: [[{0}]]".format(page.title)
            self.logger.info(log)
        else:
            log = u"Tagging new page: [[{0}]]".format(page.title)
            self.logger.info(log)
            # job.append may be None when no append text was supplied;
            # treat that as "nothing to append" instead of failing.
            banner = "{{" + job.banner + (job.append or "") + "}}"
            summary = job.summary.replace("$3", banner)
            page.edit(banner, self.make_summary(summary))
        return
    except exceptions.InvalidPageError:
        log = u"Skipping invalid page: [[{0}]]".format(page.title)
        self.logger.error(log)
        return

    for template in code.ifilter_templates(recursive=True):
        name = self._upperfirst(template.name.strip())
        if name in job.names:
            log = u"Skipping page: [[{0}]]; already tagged with '{1}'"
            self.logger.info(log.format(page.title, name))
            return

    banner = self.make_banner(job, code)
    shell = self.get_banner_shell(code)
    if shell:
        # Inside a shell, banners live in its first positional parameter.
        if shell.has_param(1):
            shell.get(1).value.insert(0, banner + "\n")
        else:
            shell.add(1, banner)
    else:
        self.add_banner(code, banner)
    self.apply_genfixes(code)

    self.logger.info(u"Tagging page: [[{0}]]".format(page.title))
    summary = job.summary.replace("$3", banner)
    page.edit(unicode(code), self.make_summary(summary))

def make_banner(self, job, code):
    """Return banner text to add based on a *job* and a page's *code*.

    The result is ``{{`` + ``job.banner`` + an optional ``|class=...`` +
    ``job.append`` + ``}}``.

    The class is only added when ``job.autoassess`` is set. In that case the
    ``class`` parameter of every template in *code* is collected (ignoring
    surrounding whitespace and letter case), and if more than 75% of the
    recognised values agree, that value is used. When no template carries a
    recognised class, no class is added.
    """
    banner = "{{" + job.banner
    if job.autoassess:
        known = ("fa", "fl", "ga", "a", "b", "start", "stub", "list", "dab",
                 "c", "redirect", "book", "template", "category")
        counts = {}
        for template in code.ifilter_templates(recursive=True):
            if template.has_param("class"):
                # Parameter values often carry a trailing newline or padding
                # spaces, so strip before comparing.
                value = unicode(template.get("class").value).strip().lower()
                if value in known:
                    counts[value] = counts.get(value, 0) + 1
        total = sum(counts.values())
        if total:  # Nothing to assess from (and nothing to divide by) at 0
            rank, best = max(counts.items(), key=lambda item: item[1])
            # Above 75% agreement there can only be a single winner.
            if float(best) / total > 0.75:
                if rank in ("fa", "fl", "ga"):
                    banner += "|class=" + rank.upper()
                else:
                    banner += "|class=" + self._upperfirst(rank)
    # job.append may be None when no append text was supplied.
    return banner + (job.append or "") + "}}"

def get_banner_shell(self, code):
    """Return the banner shell template within *code*, else ``None``.

    Templates are first searched without recursion; only if that finds
    nothing is the search repeated with ``recursive=True``. The first match
    is returned.
    """
    # The trailing group requires the template name to end right after the
    # shell name (at "|" or "}}"). Without it, names that merely start with
    # a shell name - e.g. "WPBiography" or "Shellfish" - would be mistaken
    # for a banner shell.
    regex = (r"^\{\{\s*((WikiProject|WP)[ _]?Banner[ _]?S(hell)?|"
             r"W(BPS|PBS|PB)|Shell)\s*(\||\}\})")
    shells = code.filter_templates(matches=regex)
    if not shells:
        shells = code.filter_templates(matches=regex, recursive=True)
    if not shells:
        return None
    log = u"Inserting banner into shell: {0}"
    self.logger.debug(log.format(shells[0].name))
    return shells[0]

def add_banner(self, code, banner):
    """Add *banner* to *code*, following template order conventions.

    The banner is inserted directly after the last top-level template whose
    name matches one of the ``TOP_TEMPS`` patterns, or at the very start of
    *code* when no such template is present. Names are compared stripped,
    lower-cased and with underscores replaced by spaces.
    """
    index = 0
    # Only top-level templates are examined, because the position found
    # here is used as an index into *code* itself.
    for template in code.ifilter_templates(recursive=False):
        name = template.name.strip().lower().replace("_", " ")
        if any(re.match(regex, name) for regex in self.TOP_TEMPS):
            self.logger.info(u"Skipping top template: {0}".format(name))
            # Look up the template's real position in *code*: counting
            # templates alone would ignore the text between them and put
            # the banner too early.
            index = code.index(template) + 1

    self.logger.debug(u"Inserting banner at index {0}".format(index))
    code.insert(index, banner)

def apply_genfixes(self, code):
    """Apply general fixes to *code*, such as template substitution.

    Currently the only fix is to rename transclusions of the unsigned
    template and the aliases listed in the pattern below to
    ``subst:unsigned``.
    """
    # The trailing group requires the template name to end right after one
    # of the listed aliases (at "|" or "}}"). Without it, the optional parts
    # of the pattern let it match any template whose name starts with "s",
    # "uns" or "nos", as well as differently-behaving templates such as
    # "unsigned2".
    regex = (r"^\{\{\s*((un|no)?s(i((gn|ng)(ed3?)?|g))?|usu|tilde|"
             r"forgot to sign|without signature)\s*(\||\}\})")
    for template in code.ifilter_templates(matches=regex):
        self.logger.debug("Applying genfix: substitute {{unsigned}}")
        template.name = "subst:unsigned"


class _Job(object):
"""Represents a single wikiproject-tagging task.

Stores information on the banner to add, the edit summary to use, whether
or not to autoassess and create new pages from scratch, and a counter of
the number of pages edited.
"""
def __init__(self, banner, names, summary, append, autoassess, nocreate):
self.banner = banner
self.names = names
self.summary = summary
self.append = append
self.autoassess = autoassess
self.nocreate = nocreate
self.counter = 0


class _ShutoffEnabled(Exception):
"""Raised by process_page() if shutoff is enabled. Caught by run(), which
will then stop the task."""
pass

Loading…
Откажи
Сачувај