+
+
+ """
+
+ page = PageSchema.parse(html)
+ print(page)
+ # PageSchema(
+ # content='\nThis is my page content\n',
+ # description='This is my page description',
+ # stylesheets=['/dist/css/third-party.css', '/dist/css/style.css'],
+ # title='My page title'
+ # )
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..2b9ba89
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,225 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help
+# Print the list of supported build targets (this is the default goal).
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo " html to make standalone HTML files"
+	@echo " dirhtml to make HTML files named index.html in directories"
+	@echo " singlehtml to make a single large HTML file"
+	@echo " pickle to make pickle files"
+	@echo " json to make JSON files"
+	@echo " htmlhelp to make HTML files and a HTML help project"
+	@echo " qthelp to make HTML files and a qthelp project"
+	@echo " applehelp to make an Apple Help Book"
+	@echo " devhelp to make HTML files and a Devhelp project"
+	@echo " epub to make an epub"
+	@echo " epub3 to make an epub3"
+	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo " latexpdf to make LaTeX files and run them through pdflatex"
+	@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo " text to make text files"
+	@echo " man to make manual pages"
+	@echo " texinfo to make Texinfo files"
+	@echo " info to make Texinfo files and run them through makeinfo"
+	@echo " gettext to make PO message catalogs"
+	@echo " changes to make an overview of all changed/added/deprecated items"
+	@echo " xml to make Docutils-native XML files"
+	@echo " pseudoxml to make pseudoxml-XML files for display purposes"
+	@echo " linkcheck to check all external links for integrity"
+	@echo " doctest to run all doctests embedded in the documentation (if enabled)"
+	@echo " coverage to run coverage check of the documentation (if enabled)"
+	@echo " dummy to check syntax errors of document sources"
+
+.PHONY: clean
+# Remove everything under $(BUILDDIR). BUILDDIR can be overridden from the
+# command line, so refuse to run when it is empty: the recipe would otherwise
+# expand to `rm -rf /*`.
+clean:
+	$(if $(strip $(BUILDDIR)),rm -rf $(BUILDDIR)/*,$(error BUILDDIR is empty; refusing to run rm -rf /*))
+
+.PHONY: html
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+.PHONY: dirhtml
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+.PHONY: singlehtml
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+.PHONY: pickle
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+.PHONY: json
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+.PHONY: htmlhelp
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+.PHONY: qthelp
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/soup_schema.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/soup_schema.qhc"
+
+.PHONY: applehelp
+applehelp:
+ $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+ @echo
+ @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+ @echo "N.B. You won't be able to view it unless you put it in" \
+ "~/Library/Documentation/Help or install it in your application" \
+ "bundle."
+
+.PHONY: devhelp
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/soup_schema"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/soup_schema"
+ @echo "# devhelp"
+
+.PHONY: epub
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+.PHONY: epub3
+epub3:
+ $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
+ @echo
+ @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
+
+.PHONY: latex
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+.PHONY: latexpdf
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: latexpdfja
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: text
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+.PHONY: man
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+.PHONY: texinfo
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+.PHONY: info
+# Build the Texinfo sources, then run the generated Makefile's `info` target.
+# The sub-make is invoked through $(MAKE) (as latexpdf/latexpdfja do) so that
+# -j, -n and the jobserver are passed down to it.
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+.PHONY: gettext
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+.PHONY: changes
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+.PHONY: linkcheck
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+.PHONY: doctest
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+.PHONY: coverage
+coverage:
+ $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+ @echo "Testing of coverage in the sources finished, look at the " \
+ "results in $(BUILDDIR)/coverage/python.txt."
+
+.PHONY: xml
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+.PHONY: pseudoxml
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+.PHONY: dummy
+dummy:
+ $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
+ @echo
+ @echo "Build finished. Dummy builder generates no files."
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..1a57872
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,428 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# soup_schema documentation build configuration file, created by
+# sphinx-quickstart on Sat Sep 10 08:49:41 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.viewcode',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'soup_schema'
+copyright = '2016, Brett Langdon '
+author = 'Brett Langdon'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.1.0'
+# The full version, including alpha/beta/rc tags.
+release = '0.1.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = 'en'
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = 'soup_schema v'
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' user can custom change `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'soup_schemadoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, 'soup_schema.tex', 'soup\\_schema Documentation',
+ 'Author', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, will not define \strong, \code, \titleref, \crossref ... but only
+# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
+# packages.
+#
+# latex_keep_old_macro_names = True
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ (master_doc, 'soup_schema', 'soup_schema Documentation',
+ [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'soup_schema', 'soup_schema Documentation',
+ author, 'soup_schema', 'One line description of project.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
+
+
+# -- Options for Epub output ----------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+epub_author = author
+epub_publisher = author
+epub_copyright = copyright
+
+# The basename for the epub file. It defaults to the project name.
+# epub_basename = project
+
+# The HTML theme for the epub output. Since the default themes are not
+# optimized for small screen space, using the same theme for HTML and epub
+# output is usually not wise. This defaults to 'epub', a theme designed to save
+# visual space.
+#
+# epub_theme = 'epub'
+
+# The language of the text. It defaults to the language option
+# or 'en' if the language is not set.
+#
+# epub_language = ''
+
+# The scheme of the identifier. Typical schemes are ISBN or URL.
+# epub_scheme = ''
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A tuple containing the cover image and cover page html template filenames.
+#
+# epub_cover = ()
+
+# A sequence of (type, uri, title) tuples for the guide element of content.opf.
+#
+# epub_guide = ()
+
+# HTML files that should be inserted before the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#
+# epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#
+# epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# The depth of the table of contents in toc.ncx.
+#
+# epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#
+# epub_tocdup = True
+
+# Choose between 'default' and 'includehidden'.
+#
+# epub_tocscope = 'default'
+
+# Fix unsupported image types using the Pillow.
+#
+# epub_fix_images = False
+
+# Scale large images.
+#
+# epub_max_image_width = 0
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# epub_show_urls = 'inline'
+
+# If false, no index is generated.
+#
+# epub_use_index = True
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..6484c5d
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,56 @@
+.. soup_schema documentation master file, created by
+ sphinx-quickstart on Sat Sep 10 08:49:41 2016.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. toctree::
+ :maxdepth: 4
+
+.. include:: ../README.rst
+
+API documentation
+-----------------
+
+soup_schema
+***********
+
+All submodule classes are exposed and importable directly from ``soup_schema``.
+
+- ``soup_schema.Schema`` - :class:`soup_schema.schema.Schema`
+- ``soup_schema.Selector`` - :class:`soup_schema.selector.Selector`
+- ``soup_schema.AnySelector`` - :class:`soup_schema.selector.AnySelector`
+- ``soup_schema.AttrSelector`` - :class:`soup_schema.selector.AttrSelector`
+- ``soup_schema.SchemaSelector`` - :class:`soup_schema.selector.SchemaSelector`
+- ``soup_schema.ValidationError`` - :class:`soup_schema.error.ValidationError`
+
+
+Schema
+~~~~~~
+
+.. automodule:: soup_schema.schema
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Selector
+~~~~~~~~
+
+.. automodule:: soup_schema.selector
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Error
+~~~~~
+
+.. automodule:: soup_schema.error
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/soup_schema/error.py b/soup_schema/error.py
index 15e676c..616272a 100644
--- a/soup_schema/error.py
+++ b/soup_schema/error.py
@@ -1,2 +1,3 @@
class ValidationError(Exception):
+ """soup_schema error raised when a :class:`Selector` could not be parsed"""
pass
diff --git a/soup_schema/schema.py b/soup_schema/schema.py
index b4a1841..c28ba6b 100644
--- a/soup_schema/schema.py
+++ b/soup_schema/schema.py
@@ -4,16 +4,48 @@ from .selector import Selector
class Schema(object):
- __version__ = 1
+ """
+ Base class to inherit from for defining custom HTML schemas
+ :Example:
+
+ .. code:: python
+
+ class CustomSchema(Schema):
+ # Parse the `<title>` element from the document
+ title = Selector('title', required=True)
+ # ... define other selectors here
+
+ html = \"\"\"
+ <html>
+ <head>
+ <title>My page title</title>
+ </head>
+ <body>
+ </body>
+ </html>
+ \"\"\"
+ parsed = CustomSchema.parse(html)
+ """
@classmethod
def _get_selectors(cls):
+ """Helper to get all the selectors defined on this Schema"""
for name, value in cls.__dict__.items():
if isinstance(value, Selector):
yield name, value
@classmethod
def parse(cls, html):
+ """
+ Parse the provided html document into this schema.
+
+ :param html: The text content of the HTML document to parse
+ :type html: (str, bytes)
+ :return: An instance of :class:`soup_schema.schema.Schema` which has had its selectors parsed from ``html``
+ :rtype: :class:`soup_schema.schema.Schema`
+ :raises: :class:`soup_schema.error.ValidationError` if there was a problem parsing a selector
+ (e.g. one was required but none was found)
+ """
instance = cls()
soup = BeautifulSoup(html, 'html.parser')
for name, value in cls._get_selectors():
@@ -24,7 +56,7 @@ class Schema(object):
properties = []
for name, _ in self.__class__._get_selectors():
value = getattr(self, name, None)
- properties.append('{name}={value}'.format(name=name, value=value))
+ properties.append('{name}={value}'.format(name=name, value=repr(value)))
return (
'{name}({properties})'
.format(name=self.__class__.__name__, properties=', '.join(properties))
diff --git a/soup_schema/selector.py b/soup_schema/selector.py
index 4ba2314..0228ff8 100644
--- a/soup_schema/selector.py
+++ b/soup_schema/selector.py
@@ -4,12 +4,41 @@ from .error import ValidationError
class Selector(object):
+ """
+ Base selector class used for defining properties on a :class:`soup_schema.schema.Schema`
+
+ A selector is used to define how a property should be parsed from the HTML document.
+
+ :Example:
+
+ .. code:: python
+
+ class CustomSchema(Schema):
+ # Parse the `<title>` element from the document
+ title = Selector('title', required=True)
+ # ... define other selectors
+ """
def __init__(self, selector, required=False, as_list=False):
+ """
+ Constructor for defining a new :class:`soup_schema.selector.Selector`.
+
+ .. seealso::
+ `BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_
+
+ :param selector: The CSS selector to use for finding a given element in the HTML document.
+ :type selector: str
+ :param required: Whether or not an exception should be thrown if this selector could not be parsed.
+ :type required: bool
+ :param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
+ element matching the provided ``selector``
+ :type as_list: bool
+ """
self.selector = selector
self.as_list = as_list
self.required = required
def _get_value(self, elm):
+ """Internal method for parsing the value from a BeautifulSoup element"""
if elm is None:
return None
if 'content' in elm.attrs:
@@ -17,6 +46,16 @@ class Selector(object):
return elm.text
def resolve(self, soup):
+ """
+ Resolve the value for this selector from the provided HTML document (or BeautifulSoup element).
+
+ :param soup: HTML document content as a string or BeautifulSoup object to parse this selector from
+ :type soup: :class:`bs4.BeautifulSoup`, :class:`bs4.element.Tag`, str, or bytes
+ :returns: The parsed element value, will be a str if a single element, list if ``as_list is True``, or
+ else ``None`` if no matching element was found.
+ :rtype: str, list, None
+ :raises: :class:`soup_schema.error.ValidationError` if ``required is True`` and no matching element was found.
+ """
if isinstance(soup, (str, bytes)):
soup = BeautifulSoup(soup, 'html.parser')
value = None
@@ -35,31 +74,155 @@ class Selector(object):
class AttrSelector(Selector):
+ """
+ Selector type which parses its value from an element attribute
+
+ :Example:
+
+ .. code:: python
+
+ class CustomSchema(Schema):
+ # Parse the `href` attribute from all links in the HTML document
+ hrefs = AttrSelector('a', 'href', as_list=True)
+ # ... define other selectors
+ """
def __init__(self, selector, attribute, *args, **kwargs):
+ """
+ Constructor for defining a new :class:`soup_schema.selector.AttrSelector`.
+
+ .. seealso::
+ `BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_
+
+ :param selector: The CSS selector to use for finding a given element in the HTML document.
+ :type selector: str
+ :param attribute: The name of the attribute to parse from the matching element
+ :type attribute: str
+ :param required: Whether or not an exception should be thrown if this selector could not be parsed.
+ :type required: bool
+ :param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
+ element matching the provided ``selector``
+ :type as_list: bool
+ """
super(AttrSelector, self).__init__(selector=selector, *args, **kwargs)
self.attribute = attribute
def _get_value(self, elm):
+ """Internal method for parsing an attribute from an element"""
if elm is None:
return None
return elm.attrs.get(self.attribute)
class SchemaSelector(Selector):
+ """
+ Selector type which uses a :class:`soup_schema.schema.Schema` to parse its value
+
+ :Example:
+
+ .. code:: python
+
+ example_html_doc = \"\"\"
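+ <html>
+ <body>
+ <div class="review">
+ <div class="review__author">Author Name</div>
+ <div class="review__content">This review is awesome</div>
+ </div>
+ <div class="review">
+ <div class="review__author">Another reviewer</div>
+ <div class="review__content">This review is not as awesome as the last</div>
+ </div>
+ </body>
+ </html>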
+ \"\"\"
+
+ class ReviewSchema(Schema):
+ author = Selector('.review__author', required=True)
+ review = Selector('.review__content', required=True)
+
+ class DocumentSchema(Schema):
+ # This selector will use `ReviewSchema` to parse each instance of `.review` in the document
+ reviews = SchemaSelector('.review', ReviewSchema, as_list=True)
+ """
def __init__(self, selector, schema, *args, **kwargs):
+ """
+ Constructor for defining a new :class:`soup_schema.selector.SchemaSelector`.
+
+ .. seealso:
+ `BeautifulSoup CSS Selectors `_
+
+ :param selector: The CSS selector to use for finding a given element in the HTML document.
+ :type selector: str
+ :param schema: The name of the attribute to parse from the matching element
+ :type: schema: :class:`soup_schema.schema.Schema`
+ :param required: Whether or not an exception should be thrown if this selector could not be parsed.
+ :type required: bool
+ :param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
+ element matching the provided ``selector``
+ :type as_list: bool
+ """
super(AttrSelector, self).__init__(selector=selector, *args, **kwargs)
self.schema = schema
def _get_value(self, elm):
+ """Internal method for parsing a Schema from an element"""
return self.schema.resolve(elm)
class AnySelector(Selector):
+ """
+ Selector type which is used as a boolean "or" for parsing an element's value.
+
+ This selector type is useful when you want to be able to search multiple locations for a property's value.
+
+ :Example:
+
+ .. code:: python
+
+ example_html_doc = \"\"\"
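+ <html>
+ <head>
+ <meta name="description" content="My page description">
+ <meta name="og:description" content="My Open Graph description">
+ </head>
+ </html>
+ \"\"\"
+
+ class DocumentSchema(Schema):
+ # Resolve the description by trying each selector in order:
+ # - first try to parse the `<meta name="description">` element
+ # - if that was not found, then try to parse the `<meta name="og:description">` element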
+ # - if that was also not found, then raise an exception (because of `required=True`)
+ description = AnySelector([
+ Selector('[name=description]'),
+ Selector('[name=og:description]'),
+ ], required=True)
+ """
def __init__(self, selectors, required=False):
+ """
+ Constructor for defining a new :class:`AnySelector`.
+
+ :param selectors: The :class:`soup_schema.selector.Selector` instances to use when searching for this property's value
+ :type selectors: list of :class:`soup_schema.selector.Selector`
+ :param required: Whether or not an exception should be thrown if this selector could not be parsed.
+ :type required: bool
+ """
self.selectors = selectors
self.required = required
def resolve(self, soup):
+ """
+ Resolve the value for this selector from the provided HTML document (or BeautifulSoup element).
+
+ .. seealso:: :meth:`soup_schema.selector.Selector.resolve`
+
+ :param soup: HTML document content as a string or BeautifulSoup object to parse this selector from
+ :type soup: :class:`bs4.BeautifulSoup`, :class:`bs4.element.Tag`, str, or bytes
+ :returns: The value of the first matching selector in ``selectors``.
+ :rtype: str, list, None
+ :raises: :class:`soup_schema.error.ValidationError` if ``required is True`` and no matching element was found.
+ """
for selector in self.selectors:
try:
value = selector.resolve(soup)