You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

63 lines
2.0 KiB

from bs4 import BeautifulSoup
from .selector import Selector
class Schema(object):
    """
    Base class to inherit from for defining custom HTML schemas.

    Selectors are declared as class attributes. They are collected from the
    whole class hierarchy, so a schema may subclass another schema and
    inherit (or override) its selectors.

    :Example:

    .. code:: python

        class CustomSchema(Schema):
            # Parse the `<title></title>` element from the document
            title = Selector('title', required=True)

            # ... define other selectors here

        html = \"\"\"
        <html>
          <head>
            <title>My page title</title>
          </head>
          <body>
          </body>
        </html>
        \"\"\"
        parsed = CustomSchema.parse(html)
    """

    @classmethod
    def _get_selectors(cls):
        """
        Helper to get all the selectors defined on this Schema.

        The method resolution order is walked from the most derived class to
        the base classes, so selectors declared on parent schemas are
        included. A name is only considered the first time it is seen, which
        means an attribute on a subclass shadows a same-named attribute on a
        parent (even when the subclass attribute is not a selector).

        :return: A generator of ``(name, selector)`` pairs
        """
        seen = set()
        for klass in cls.__mro__:
            for name, value in vars(klass).items():
                if name in seen:
                    # Already provided (or shadowed) by a more derived class
                    continue
                seen.add(name)
                if isinstance(value, Selector):
                    yield name, value

    @classmethod
    def parse(cls, html):
        """
        Parse the provided html document into this schema.

        :param html: The text content of the HTML document to parse
        :type html: (str, bytes)
        :return: An instance of :class:`soup_schema.schema.Schema` which has had its selectors parsed from ``html``
        :rtype: :class:`soup_schema.schema.Schema`
        :raises: :class:`soup_schema.error.ValidationError` if there was a problem parsing a selector
            (e.g. one was required but none was found)
        """
        instance = cls()
        soup = BeautifulSoup(html, 'html.parser')
        # Replace each selector definition with its resolved value on the
        # instance; the class-level selectors themselves are left untouched.
        for name, value in cls._get_selectors():
            setattr(instance, name, value.resolve(soup))
        return instance

    def __repr__(self):
        """
        Build a ``ClassName(name=value, ...)`` representation listing the
        current value of every selector attribute on this instance.
        """
        properties = ', '.join(
            '{name}={value}'.format(name=name, value=repr(getattr(self, name, None)))
            for name, _ in self.__class__._get_selectors()
        )
        return (
            '{name}({properties})'
            .format(name=self.__class__.__name__, properties=properties)
        )