Browse Source

Primitive screen scraper for HTML using BeautifulSoup and LXML.

Obviously this can and should be improved significantly later, but it seems
good enough for now.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
cb87004107
2 changed files with 28 additions and 1 deletions
  1. +26
    -1
      earwigbot/wiki/copyvios/parsers.py
  2. +2
    -0
      setup.py

+ 26
- 1
earwigbot/wiki/copyvios/parsers.py View File

@@ -20,6 +20,13 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs

try:
from bs4 import BeautifulSoup
except ImportError:
BeautifulSoup = None

try:
import mwparserfromhell
except ImportError:
@@ -76,6 +83,24 @@ class ArticleTextParser(BaseTextParser):

class HTMLTextParser(BaseTextParser):
    """A parser that can extract the text from an HTML document."""

    # Tags whose contents are removed before the text is collected.
    hidden_tags = [
        "script", "style"
    ]

    def strip(self):
        """Return the actual text contained within an HTML document.

        Implemented using :py:mod:`BeautifulSoup <bs4>`
        (http://www.crummy.com/software/BeautifulSoup/).

        HTML comments and every element named in :py:attr:`hidden_tags` are
        removed from the parsed tree; the remaining strings are stripped of
        surrounding whitespace and joined with newlines. If the document has
        no ``<body>``, the whole parsed document is used instead.
        """
        # Only ``BeautifulSoup`` itself is bound at module level, so the
        # ``Comment`` class has to be brought into scope here.
        from bs4.element import Comment

        try:
            document = BeautifulSoup(self.text, "lxml")
        except ValueError:
            # Presumably the lxml tree builder is unavailable; fall back to
            # whichever parser BeautifulSoup picks by default.
            document = BeautifulSoup(self.text)

        # ``.body`` is None for empty input or a fragment without <body>.
        soup = document.body
        if soup is None:
            soup = document

        is_comment = lambda text: isinstance(text, Comment)
        # find_all() returns a list, so extracting while looping is safe.
        for comment in soup.find_all(text=is_comment):
            comment.extract()
        for tag in self.hidden_tags:
            for element in soup.find_all(tag):
                element.extract()

        return "\n".join(soup.stripped_strings)

+ 2
- 0
setup.py View File

@@ -34,6 +34,8 @@ setup(
entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git
"PyYAML >= 3.10", # Config parsing
"beautifulsoup4 >= 4.1.1", # HTML parsing/scraping
"lxml >= 2.3.4", # Faster parser for BeautifulSoup
"mwparserfromhell >= 0.1", # Wikicode parsing
"oursql >= 0.9.3", # Talking with MediaWiki databases
"oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search


Loading…
Cancel
Save