Browse Source

Docs are good

master
Brett Langdon 9 years ago
parent
commit
d1b25d0762
No known key found for this signature in database GPG Key ID: A2ECAB73CE12147F
8 changed files with 962 additions and 3 deletions
  1. +3
    -0
      .gitignore
  2. +52
    -1
      README.rst
  3. +225
    -0
      docs/Makefile
  4. +428
    -0
      docs/conf.py
  5. +56
    -0
      docs/index.rst
  6. +1
    -0
      soup_schema/error.py
  7. +34
    -2
      soup_schema/schema.py
  8. +163
    -0
      soup_schema/selector.py

+ 3
- 0
.gitignore View File

@ -2,3 +2,6 @@
*.egg-info *.egg-info
dist/ dist/
build/ build/
docs/_build
docs/_static
docs/_templates

+ 52
- 1
README.rst View File

@ -1,2 +1,53 @@
soup-schema soup-schema
~~~~~~~~~~~
===========
Define schemas for parsing HTML with BeautifulSoup4_.
.. _BeautifulSoup4: https://www.crummy.com/software/BeautifulSoup/
Installing
----------
.. code:: bash
pip install soup_schema
Example usage
-------------
.. code:: python
from soup_schema import Schema, Selector, AttrSelector
class PageSchema(Schema):
content = Selector('#content', required=True)
description = Selector('[name=description]')
stylesheets = AttrSelector('[rel=stylesheet]', 'href', as_list=True)
title = Selector('title', required=True)
html = """
<html>
<head>
<title>My page title</title>
<link rel="stylesheet" href="/dist/css/third-party.css" />
<link rel="stylesheet" href="/dist/css/style.css" />
<meta name="description" content="This is my page description" />
</head>
<body>
<div id="content">
<p>This is my page content</p>
</div>
</body>
</html>
"""
page = PageSchema.parse(html)
print(page)
# PageSchema(
# content='\nThis is my page content\n',
# description='This is my page description',
# stylesheets=['/dist/css/third-party.css', '/dist/css/style.css'],
# title='My page title'
# )

+ 225
- 0
docs/Makefile View File

@ -0,0 +1,225 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
# SPHINXOPTS: extra options appended to every sphinx-build invocation below.
SPHINXOPTS =
# SPHINXBUILD: the command used to run the Sphinx builder.
SPHINXBUILD = sphinx-build
# PAPER: set to `a4` or `letter` to select one of the PAPEROPT_* values below;
# when left empty no paper-size option is passed.
PAPER =
# BUILDDIR: root of all generated output; each target writes to a subdirectory of it.
BUILDDIR = _build
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
# $(PAPEROPT_$(PAPER)) is a computed variable name: it expands to PAPEROPT_a4 or
# PAPEROPT_letter depending on PAPER. The trailing `.` is passed to sphinx-build
# ahead of the output directory that each recipe appends.
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
# (hence no `-d $(BUILDDIR)/doctrees` here)
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# help: print the list of available documentation targets.
# This is the first rule in the file, so a bare `make` runs it.
# Every line is silenced with `@` so only the message text is shown.
.PHONY: help
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " applehelp to make an Apple Help Book"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " epub3 to make an epub3"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
@echo " coverage to run coverage check of the documentation (if enabled)"
@echo " dummy to check syntax errors of document sources"
# clean: delete everything inside the build directory.
# The first recipe line aborts when BUILDDIR is empty (e.g. `make clean BUILDDIR=`),
# because the glob below would otherwise expand to `rm -rf /*`.
.PHONY: clean
clean:
	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run 'rm -rf /*'))
	rm -rf $(BUILDDIR)/*
# html: standalone HTML pages. The target name doubles as the builder name
# and as the output subdirectory, so `$@` is used for both.
.PHONY: html
html:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/$@."
# dirhtml: HTML pages written as index.html files inside per-page directories.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: dirhtml
dirhtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/$@."
# singlehtml: the whole documentation as one large HTML page.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: singlehtml
singlehtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/$@."
# pickle: pickle files for further processing.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: pickle
pickle:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the pickle files."
# json: JSON files for further processing.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: json
json:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the JSON files."
# htmlhelp: HTML files plus an HTML Help project.
# The closing message keeps its two echo arguments, now on a single line.
.PHONY: htmlhelp
htmlhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" ".hhp project file in $(BUILDDIR)/$@."
# qthelp: HTML files plus a Qt help project, followed by usage hints.
# The inner double quotes around qcollectiongenerator are escaped so the shell
# prints them instead of treating them as string delimiters and dropping them.
.PHONY: qthelp
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/soup_schema.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/soup_schema.qhc"
# applehelp: an Apple Help Book, followed by a note on where to install it.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: applehelp
applehelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The help book is in $(BUILDDIR)/$@."
	@echo "N.B. You won't be able to view it unless you put it in" \
	      "~/Library/Documentation/Help or install it in your application bundle."
# devhelp: HTML files plus a Devhelp project, followed by viewing instructions.
# `$$HOME` is doubled so the shell, not make, sees the `$`; it is printed
# literally because the instructions are only echoed, never executed.
.PHONY: devhelp
devhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/soup_schema"
	@echo "# ln -s $(BUILDDIR)/$@ $$HOME/.local/share/devhelp/soup_schema"
	@echo "# devhelp"
# epub: an epub file.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: epub
epub:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/$@."
# epub3: an epub3 file.
# `$@` supplies the builder name, the output subdirectory and the format name
# in the closing message.
.PHONY: epub3
epub3:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The $@ file is in $(BUILDDIR)/$@."
# latex: LaTeX sources only; the closing hint explains how to turn them into PDF.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: latex
latex:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/$@."
	@echo "Run \`make' in that directory to run these through (pdf)latex (use \`make latexpdf' here to do that automatically)."
# latexpdf: build the LaTeX sources, then run the generated Makefile's
# `all-pdf` goal inside the LaTeX output directory.
# The output directory is held in a target-specific variable so it is spelled once.
.PHONY: latexpdf
latexpdf: latex_out = $(BUILDDIR)/latex
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(latex_out)
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(latex_out) all-pdf
	@echo "pdflatex finished; the PDF files are in $(latex_out)."
# latexpdfja: build the LaTeX sources, then run the generated Makefile's
# `all-pdf-ja` goal inside the LaTeX output directory.
# The output directory is held in a target-specific variable so it is spelled once.
.PHONY: latexpdfja
latexpdfja: latex_out = $(BUILDDIR)/latex
latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(latex_out)
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(latex_out) all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(latex_out)."
# text: plain-text output.
# `$@` supplies the builder name, the output subdirectory and the format name
# in the closing message.
.PHONY: text
text:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The $@ files are in $(BUILDDIR)/$@."
# man: manual pages.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: man
man:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/$@."
# texinfo: Texinfo sources only; the closing hint explains how to run makeinfo.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: texinfo
texinfo:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/$@."
	@echo "Run \`make' in that directory to run these through makeinfo (use \`make info' here to do that automatically)."
# info: build the Texinfo sources, then run the generated Makefile's `info`
# goal inside the Texinfo output directory.
# The sub-make is invoked through $(MAKE) rather than a literal `make`, like
# the latexpdf targets, so that the same make program is used and command-line
# flags such as -j and -n are passed down.
.PHONY: info
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
# gettext: PO message catalogs, written to the `locale` subdirectory.
# This target passes I18NSPHINXOPTS instead of ALLSPHINXOPTS.
# The output directory is held in a target-specific variable so it is spelled once.
.PHONY: gettext
gettext: locale_out = $(BUILDDIR)/locale
gettext:
	$(SPHINXBUILD) -b $@ $(I18NSPHINXOPTS) $(locale_out)
	@echo
	@echo "Build finished. The message catalogs are in $(locale_out)."
# changes: overview of all changed/added/deprecated items.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: changes
changes:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "The overview file is in $(BUILDDIR)/$@."
# linkcheck: verify external links.
# The closing message keeps its two echo arguments (including the trailing
# space inside the first one), now on a single line.
.PHONY: linkcheck
linkcheck:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Link check complete; look for any errors in the above output " "or in $(BUILDDIR)/$@/output.txt."
# doctest: run the doctests embedded in the documentation sources.
# The closing message keeps its two echo arguments (including the trailing
# space inside the first one), now on a single line.
.PHONY: doctest
doctest:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of doctests in the sources finished, look at the " "results in $(BUILDDIR)/$@/output.txt."
# coverage: run the documentation coverage check.
# The closing message keeps its two echo arguments (including the trailing
# space inside the first one), now on a single line.
.PHONY: coverage
coverage:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of coverage in the sources finished, look at the " "results in $(BUILDDIR)/$@/python.txt."
# xml: Docutils-native XML files.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: xml
xml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/$@."
# pseudoxml: pseudo-XML files intended for display purposes.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: pseudoxml
pseudoxml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/$@."
# dummy: check the document sources for syntax errors.
# `$@` supplies both the builder name and the output subdirectory.
.PHONY: dummy
dummy:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. Dummy builder generates no files."

+ 428
- 0
docs/conf.py View File

@ -0,0 +1,428 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# soup_schema documentation build configuration file, created by
# sphinx-quickstart on Sat Sep 10 08:49:41 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.todo',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The encoding of source files.
#
# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'soup_schema'
copyright = '2016, Brett Langdon <me@brett.is>'
author = 'Brett Langdon'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.1.0'
# The full version, including alpha/beta/rc tags.
release = '0.1.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'en'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#
# today = ''
#
# Else, today_fmt is used as the format for a strftime call.
#
# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
# keep_warnings = False
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
# html_theme_path = []
# The name for this set of Sphinx documents.
# "<project> v<release> documentation" by default.
#
# html_title = 'soup_schema v'
# A shorter title for the navigation bar. Default is the same as html_title.
#
# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#
# html_logo = None
# The name of an image file (relative to this directory) to use as a favicon of
# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#
# html_extra_path = []
# If not None, a 'Last updated on:' timestamp is inserted at every page
# bottom, using the given strftime format.
# The empty string is equivalent to '%b %d, %Y'.
#
# html_last_updated_fmt = None
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#
# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#
# html_additional_pages = {}
# If false, no module index is generated.
#
# html_domain_indices = True
# If false, no index is generated.
#
# html_use_index = True
# If true, the index is split into individual pages for each letter.
#
# html_split_index = False
# If true, links to the reST sources are added to the pages.
#
# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#
# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#
# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#
# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None
# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
#
# html_search_language = 'en'
# A dictionary with options for the search language support, empty by default.
# 'ja' uses this config value.
# 'zh' users can customize the `jieba` dictionary path.
#
# html_search_options = {'type': 'default'}
# The name of a javascript file (relative to the configuration directory) that
# implements a search results scorer. If empty, the default will be used.
#
# html_search_scorer = 'scorer.js'
# Output file base name for HTML help builder.
htmlhelp_basename = 'soup_schemadoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'soup_schema.tex', 'soup\\_schema Documentation',
'Author', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#
# latex_use_parts = False
# If true, show page references after internal links.
#
# latex_show_pagerefs = False
# If true, show URL addresses after external links.
#
# latex_show_urls = False
# Documents to append as an appendix to all manuals.
#
# latex_appendices = []
# If false, will not define \strong, \code, \titleref, \crossref ... but only
# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
# packages.
#
# latex_keep_old_macro_names = True
# If false, no module index is generated.
#
# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'soup_schema', 'soup_schema Documentation',
[author], 1)
]
# If true, show URL addresses after external links.
#
# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'soup_schema', 'soup_schema Documentation',
author, 'soup_schema', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#
# texinfo_appendices = []
# If false, no module index is generated.
#
# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#
# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#
# texinfo_no_detailmenu = False
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
epub_author = author
epub_publisher = author
epub_copyright = copyright
# The basename for the epub file. It defaults to the project name.
# epub_basename = project
# The HTML theme for the epub output. Since the default themes are not
# optimized for small screen space, using the same theme for HTML and epub
# output is usually not wise. This defaults to 'epub', a theme designed to save
# visual space.
#
# epub_theme = 'epub'
# The language of the text. It defaults to the language option
# or 'en' if the language is not set.
#
# epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
# epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
#
# epub_cover = ()
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
#
# epub_guide = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#
# epub_pre_files = []
# HTML files that should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#
# epub_post_files = []
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
# The depth of the table of contents in toc.ncx.
#
# epub_tocdepth = 3
# Allow duplicate toc entries.
#
# epub_tocdup = True
# Choose between 'default' and 'includehidden'.
#
# epub_tocscope = 'default'
# Fix unsupported image types using the Pillow.
#
# epub_fix_images = False
# Scale large images.
#
# epub_max_image_width = 0
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#
# epub_show_urls = 'inline'
# If false, no index is generated.
#
# epub_use_index = True

+ 56
- 0
docs/index.rst View File

@ -0,0 +1,56 @@
.. soup_schema documentation master file, created by
sphinx-quickstart on Sat Sep 10 08:49:41 2016.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
.. toctree::
:maxdepth: 4
.. include:: ../README.rst
API documentation
-----------------
soup_schema
***********
All submodule classes are exposed and importable directly from ``soup_schema``.
- ``soup_schema.Schema`` - :class:`soup_schema.schema.Schema`
- ``soup_schema.Selector`` - :class:`soup_schema.selector.Selector`
- ``soup_schema.AnySelector`` - :class:`soup_schema.selector.AnySelector`
- ``soup_schema.AttrSelector`` - :class:`soup_schema.selector.AttrSelector`
- ``soup_schema.SchemaSelector`` - :class:`soup_schema.selector.SchemaSelector`
- ``soup_schema.ValidationError`` - :class:`soup_schema.error.ValidationError`
Schema
~~~~~~
.. automodule:: soup_schema.schema
:members:
:undoc-members:
:show-inheritance:
Selector
~~~~~~~~
.. automodule:: soup_schema.selector
:members:
:undoc-members:
:show-inheritance:
Error
~~~~~
.. automodule:: soup_schema.error
:members:
:undoc-members:
:show-inheritance:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

+ 1
- 0
soup_schema/error.py View File

@ -1,2 +1,3 @@
class ValidationError(Exception):
    """Error raised by soup_schema when a :class:`Selector` could not be parsed."""

+ 34
- 2
soup_schema/schema.py View File

@ -4,16 +4,48 @@ from .selector import Selector
class Schema(object): class Schema(object):
__version__ = 1
"""
Base class to inherit from for defining custom HTML schemas
:Example:
.. code:: python
class CustomSchema(Schema):
# Parse the `<title></title>` element from the document
title = Selector('title', required=True)
# ... define other selectors here
html = \"\"\"
<html>
<head>
<title>My page title</title>
</head>
<body>
</body>
</html>
\"\"\"
parsed = CustomSchema.parse(html)
"""
@classmethod @classmethod
def _get_selectors(cls): def _get_selectors(cls):
"""Helper to get all the selectors defined on this Schema"""
for name, value in cls.__dict__.items(): for name, value in cls.__dict__.items():
if isinstance(value, Selector): if isinstance(value, Selector):
yield name, value yield name, value
@classmethod @classmethod
def parse(cls, html): def parse(cls, html):
"""
Parse the provided html document into this schema.
:param html: The text content of the HTML document to parse
:type html: (str, bytes)
:return: An instance of :class:`soup_schema.schema.Schema` which has had its selectors parsed from ``html``
:rtype: :class:`soup_schema.schema.Schema`
:raises: :class:`soup_schema.error.ValidationError` if there was a problem parsing a selector
(e.g. one was required but none was found)
"""
instance = cls() instance = cls()
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
for name, value in cls._get_selectors(): for name, value in cls._get_selectors():
@ -24,7 +56,7 @@ class Schema(object):
properties = [] properties = []
for name, _ in self.__class__._get_selectors(): for name, _ in self.__class__._get_selectors():
value = getattr(self, name, None) value = getattr(self, name, None)
properties.append('{name}={value}'.format(name=name, value=value))
properties.append('{name}={value}'.format(name=name, value=repr(value)))
return ( return (
'{name}({properties})' '{name}({properties})'
.format(name=self.__class__.__name__, properties=', '.join(properties)) .format(name=self.__class__.__name__, properties=', '.join(properties))


+ 163
- 0
soup_schema/selector.py View File

@ -4,12 +4,41 @@ from .error import ValidationError
class Selector(object): class Selector(object):
"""
Base selector class used for defining properties on a :class:`soup_schema.schema.Schema`
A selector is used to define how a property should be parsed from the HTML document.
:Example:
.. code:: python
class CustomSchema(Schema):
# Parse the `<title></title>` element from the document
title = Selector('title', required=True)
# ... define other selectors
"""
def __init__(self, selector, required=False, as_list=False): def __init__(self, selector, required=False, as_list=False):
"""
Constructor for defining a new :class:`soup_schema.selector.Selector`.
.. seealso::
`BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_
:param selector: The CSS selector to use for finding a given element in the HTML document.
:type selector: str
:param required: Whether or not an exception should be thrown if this selector could not be parsed.
:type required: bool
:param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
element matching the provided ``selector``
:type as_list: bool
"""
self.selector = selector self.selector = selector
self.as_list = as_list self.as_list = as_list
self.required = required self.required = required
def _get_value(self, elm): def _get_value(self, elm):
"""Internal method for parsing the value from a BeautifulSoup element"""
if elm is None: if elm is None:
return None return None
if 'content' in elm.attrs: if 'content' in elm.attrs:
@ -17,6 +46,16 @@ class Selector(object):
return elm.text return elm.text
def resolve(self, soup): def resolve(self, soup):
"""
Resolve the value for this selector from the provided HTML document (or BeautifulSoup element).
:param soup: HTML document content as a string or BeautifulSoup object to parse this selector from
:type soup: :class:`bs4.BeautifulSoup`, :class:`bs4.element.Tag`, str, or bytes
:returns: The parsed element value, will be a str if a single element, list if ``as_list is True``, or
else ``None`` if no matching element was found.
:rtype: str, list, None
:raises: :class:`soup_schema.error.ValidationError` if ``required is True`` and no matching element was found.
"""
if isinstance(soup, (str, bytes)): if isinstance(soup, (str, bytes)):
soup = BeautifulSoup(soup, 'html.parser') soup = BeautifulSoup(soup, 'html.parser')
value = None value = None
@ -35,31 +74,155 @@ class Selector(object):
class AttrSelector(Selector):
    """
    Selector type which parses its value from an element attribute

    :Example:

    .. code:: python

        class CustomSchema(Schema):
            # Parse the `href` attribute from all links in the HTML document
            hrefs = AttrSelector('a', 'href', as_list=True)

            # ... define other selectors
    """
    def __init__(self, selector, attribute, *args, **kwargs):
        """
        Constructor for defining a new :class:`soup_schema.selector.AttrSelector`.

        .. seealso::
            `BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_

        :param selector: The CSS selector to use for finding a given element in the HTML document.
        :type selector: str
        :param attribute: The name of the attribute to parse from the matching element
        :type attribute: str
        :param required: Whether or not an exception should be thrown if this selector could not be parsed.
        :type required: bool
        :param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
                        element matching the provided ``selector``
        :type as_list: bool
        """
        # Forward ``selector`` positionally. Passing it by keyword ahead of
        # ``*args`` makes any positional ``required``/``as_list`` argument land
        # in the ``selector`` slot and raise "got multiple values for argument".
        super(AttrSelector, self).__init__(selector, *args, **kwargs)
        self.attribute = attribute

    def _get_value(self, elm):
        """Internal method for parsing an attribute from an element"""
        if elm is None:
            return None
        # ``.get`` yields ``None`` when the element lacks the attribute
        return elm.attrs.get(self.attribute)
class SchemaSelector(Selector):
    """
    Selector type which uses a :class:`soup_schema.schema.Schema` to parse its value

    :Example:

    .. code:: python

        example_html_doc = \"\"\"
        <html>
          <head></head>
          <body>
            <div class="review">
              <div class="review__author">Author Name</div>
              <div class="review__content">This review is awesome</div>
            </div>
            <div class="review">
              <div class="review__author">Another reviewer</div>
              <div class="review__content">This review is not as awesome as the last</div>
            </div>
          </body>
        </html>
        \"\"\"

        class ReviewSchema(Schema):
            author = Selector('.review__author', required=True)
            review = Selector('.review__content', required=True)

        class DocumentSchema(Schema):
            # This selector will use `ReviewSchema` to parse each instance of `.review` in the document
            reviews = SchemaSelector('.review', ReviewSchema, as_list=True)
    """
    def __init__(self, selector, schema, *args, **kwargs):
        """
        Constructor for defining a new :class:`soup_schema.selector.SchemaSelector`.

        .. seealso::
            `BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_

        :param selector: The CSS selector to use for finding a given element in the HTML document.
        :type selector: str
        :param schema: The schema used to parse each element matching ``selector``
        :type schema: :class:`soup_schema.schema.Schema`
        :param required: Whether or not an exception should be thrown if this selector could not be parsed.
        :type required: bool
        :param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
                        element matching the provided ``selector``
        :type as_list: bool
        """
        # ``super`` must name this class: ``super(AttrSelector, self)`` raises
        # TypeError because a SchemaSelector is not an AttrSelector instance.
        # ``selector`` is forwarded positionally so that positional
        # ``required``/``as_list`` arguments do not collide with it.
        super(SchemaSelector, self).__init__(selector, *args, **kwargs)
        self.schema = schema

    def _get_value(self, elm):
        """Internal method for parsing a Schema from an element"""
        # Same guard as the other selectors' ``_get_value``: no element, no value
        if elm is None:
            return None
        # NOTE(review): only ``Schema.parse`` is visible in this change set;
        # confirm that the schema class actually provides ``resolve``.
        return self.schema.resolve(elm)
class AnySelector(Selector): class AnySelector(Selector):
"""
Selector type which is used as a boolean "or" for parsing an element's value.
This selector type is useful when you want to be able to search multiple locations for a property's value.
:Example:
.. code:: python
example_html_doc = \"\"\"
<html>
<head>
<meta name="description" content="My description" />
<meta name="og:description" content="My description" />
</head>
<body></body>
</html>
\"\"\"
class CustomSchema(Schema):
# - Try to parse the `<meta name="description" />` element
# - if that was not found, then try to parse the `<meta name="og:description" />` element
# - if that was also not found, then raise an exception (because of `required=True`)
description = AnySelector([
Selector('[name=description]'),
Selector('[name=og:description]'),
], required=True)
"""
def __init__(self, selectors, required=False): def __init__(self, selectors, required=False):
"""
Constructor for defining a new :class:`AnySelector`.
:param selectors: The :class:`soup_schema.selector.Selector` instances to use when searching for this property's value
:type selectors: list of :class:`soup_schema.selector.Selector`
:param required: Whether or not an exception should be thrown if this selector could not be parsed.
:type required: bool
"""
self.selectors = selectors self.selectors = selectors
self.required = required self.required = required
def resolve(self, soup): def resolve(self, soup):
"""
Resolve the value for this selector from the provided HTML document (or BeautifulSoup element).
.. seealso:: :meth:`soup_schema.selector.Selector.resolve`
:param soup: HTML document content as a string or BeautifulSoup object to parse this selector from
:type soup: :class:`bs4.BeautifulSoup`, :class:`bs4.element.Tag`, str, or bytes
:returns: The value of the first matching selector in ``selectors``.
:rtype: str, list, None
:raises: :class:`soup_schema.error.ValidationError` if ``required is True`` and no matching element was found.
"""
for selector in self.selectors: for selector in self.selectors:
try: try:
value = selector.resolve(soup) value = selector.resolve(soup)


Loading…
Cancel
Save