From b0002a0ecabd43fb387caa0ce2d91ee100432335 Mon Sep 17 00:00:00 2001
From: Victor Bulatov
Date: Tue, 15 Oct 2019 18:12:40 +0300
Subject: [PATCH] add some functionality for English Wikipedia

---
 mwparserfromhell/nodes/tag.py      |  3 +++
 mwparserfromhell/nodes/template.py | 35 +++++++++++++++++++++++++++++++++++
 mwparserfromhell/nodes/wikilink.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index 70a2876..244b518 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -92,6 +92,9 @@ class Tag(Node):
 
     def __strip__(self, **kwargs):
         if self.contents and is_visible(self.tag):
+            # Pad reference text so it does not fuse with the adjacent words.
+            if self.tag == "ref":
+                return " " + self.contents.strip_code(**kwargs) + " "
             return self.contents.strip_code(**kwargs)
         return None
 
diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py
index 11bccc4..678dfce 100644
--- a/mwparserfromhell/nodes/template.py
+++ b/mwparserfromhell/nodes/template.py
@@ -33,6 +33,35 @@ __all__ = ["Template"]
 
 FLAGS = re.DOTALL | re.UNICODE
 
+
+def _render(template, pattern, *indexes):
+    """Fill *pattern* with the values of the parameters at *indexes*.
+
+    Only the parameter value is used, so a named parameter does not leak its
+    ``name=`` prefix into the text. Returns ``None`` when the template has
+    too few parameters instead of raising ``IndexError``.
+    """
+    try:
+        values = [str(template.params[index].value) for index in indexes]
+    except IndexError:
+        return None
+    return pattern.format(*values)
+
+
+# Templates whose visible text survives stripping, keyed by canonical name
+# (first letter upper-cased). Each value maps a Template to its plain text.
+TEMPLATES = {
+    "Esp": lambda x: _render(x, "* 10^{}", 0),
+    "Smallcaps": lambda x: _render(x, "{}", 0),
+    "Unicode": lambda x: _render(x, "{}", 0),
+    "IPA": lambda x: _render(x, "{}", 0),
+    "Transl": lambda x: _render(x, "{}", -1),
+    "IAST": lambda x: _render(x, "{}", 0),
+    "Ssub": lambda x: _render(x, "{}", 0),
+    "SubatomicParticle": lambda x: _render(x, "{}", 0),
+    "Convert": lambda x: _render(x, "{} {}", 0, 1),
+}
+
 class Template(Node):
     """Represents a template in wikicode, like ``{{foo}}``."""
 
@@ -59,9 +88,15 @@ class Template(Node):
             yield param.value
 
     def __strip__(self, **kwargs):
+        name = str(self.name).strip()
+        # MediaWiki ignores the case of a title's first letter only.
+        render = TEMPLATES.get(name[:1].upper() + name[1:])
+        text = render(self) if render else None
+        if text is not None:
+            return text
         if kwargs.get("keep_template_params"):
             parts = [param.value.strip_code(**kwargs) for param in self.params]
             return " ".join(part for part in parts if part)
         return None
 
     def __showtree__(self, write, get, mark):
diff --git a/mwparserfromhell/nodes/wikilink.py b/mwparserfromhell/nodes/wikilink.py
index 265a100..934ed79 100644
--- a/mwparserfromhell/nodes/wikilink.py
+++ b/mwparserfromhell/nodes/wikilink.py
@@ -28,6 +28,36 @@ from ..utils import parse_anything
 
 __all__ = ["Wikilink"]
 
+# Interwiki language codes and namespaces whose links contribute no visible
+# text (interlanguage links, category membership, embedded files).
+TECHNICAL = set(
+    "en ceb sv de fr nl ru it es pl war vi ja zh pt ar uk fa sr ca no id ko fi hu sh cs ro eu tr ms eo hy bg ce da he sk zh-min-nan kk min hr et lt be el sl gl azb az nn simple ur th hi ka uz la ta vo cy mk ast tg lv mg tt oc af bs ky sq tl bn zh-yue new te be-tarask br ml pms su nds lb jv ht mr sco szl sw ga ba pnb is my fy cv lmo an ne yo pa bar io gu als ku scn kn bpy ckb wuu ia arz qu mn bat-smg si wa cdo or yi am gd nap bug ilo mai hsb map-bms fo xmf mzn li vec sd eml sah os diq sa ps mrj mhr zh-classical hif nv roa-tara bcl ace hak frr pam nso km se rue mi vls nah bh nds-nl crh gan vep sc ab as bo glk myv co so tk fiu-vro lrc csb kv gv sn udm zea ay ie pcd nrm kab ug stq lez ha kw mwl gom haw gn rm lij lfn lad lo koi mt frp fur dsb dty ext ang ln olo cbk-zam dv bjn ksh gag pi pfl pag av bxr gor xal krc za pap kaa pdc tyv rw to kl nov jam arc kbp kbd tpi tet ig sat ki zu wo na jbo roa-rup lbe bi ty mdf kg lg tcy srn inh xh atj ltg chr sm pih om ak tn cu ts tw rmy bm st chy rn got tum ny ss ch pnt fj iu ady ve ee ks ik sg ff dz ti din cr ng cho kj mh ho ii aa mus hz kr shn hyw".split() + ["category", "file"])
+
+# Image-link options that take no value and are never part of the caption.
+_IMAGE_OPTIONS = frozenset([
+    "thumb", "thumbnail", "frame", "framed", "frameless", "border", "right",
+    "left", "center", "none", "baseline", "middle", "sub", "super",
+    "text-top", "text-bottom", "top", "bottom", "upright",
+])
+# Image-link options written as ``key=value``.
+_IMAGE_KEYED_OPTIONS = frozenset([
+    "upright", "alt", "link", "class", "lang", "page",
+])
+
+
+def _is_image_option(entry):
+    """Return True if *entry* is an image-link option rather than caption."""
+    word = entry.strip()
+    if word in _IMAGE_OPTIONS:
+        return True
+    key, equals, _ = word.partition("=")
+    if equals and key.strip() in _IMAGE_KEYED_OPTIONS:
+        return True
+    # Sizes look like ``200px``, ``x150px`` or ``200x150px``.
+    size = word[:-2] if word.endswith("px") else ""
+    return size.replace("x", "", 1).isdigit()
+
+
 class Wikilink(Node):
     """Represents an internal wikilink, like ``[[Foo|Bar]]``."""
 
@@ -47,6 +77,19 @@ class Wikilink(Node):
             yield self.text
 
     def __strip__(self, **kwargs):
+        prefix, colon, _ = str(self.title).partition(":")
+        namespace = prefix.strip().lower()
+        # Without a colon the title has no prefix: [[No]] or [[File]] are
+        # ordinary article links and must keep their text.
+        if colon and namespace in TECHNICAL:
+            return ""
+        if colon and namespace == "image" and self.text:
+            caption = "|".join(
+                entry for entry in str(self.text).split("|")
+                if not _is_image_option(entry)
+            )
+            return parse_anything(caption).strip_code(**kwargs)
+
         if self.text is not None:
             return self.text.strip_code(**kwargs)
         return self.title.strip_code(**kwargs)