From 1cf0754e738b319769eed61d388559e161c201bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 3 Jan 2020 23:40:46 +0100 Subject: [PATCH] Proposed fix for https://github.com/earwig/mwparserfromhell/issues/197 --- mwparserfromhell/parser/tokenizer.py | 3 ++- tests/tokenizer/external_links.mwtest | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index ab61f92..8e4b79d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -393,7 +393,8 @@ class Tokenizer: # scheme since it was just parsed as text: for chunk in reversed(self._textbuffer): for char in reversed(chunk): - if char.isspace() or char in self.MARKERS: + # stop at the first non-word character + if re.fullmatch(r"\W", char): raise StopIteration() if char not in valid: raise BadRoute() diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index d2efdfc..5e581f9 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -478,3 +478,17 @@ name: brackets_scheme_title_but_no_url label: brackets around a scheme, colon, and slashes, with a title input: "[http:// Example]" output: [Text(text="[http:// Example]")] + +--- + +name: url_preceded_by_non_word_character +label: non-word character immediately before a valid URL +input: "svn+ssh://server.domain.com:/reponame" +output: [Text(text="svn+"), ExternalLinkOpen(brackets=False), Text(text="ssh://server.domain.com:/reponame"), ExternalLinkClose()] + +--- + +name: url_preceded_by_underscore +label: underscore immediately before a valid URL +input: "svn_ssh://server.domain.com:/reponame" +output: [Text(text="svn_ssh://server.domain.com:/reponame")]