from bs4 import BeautifulSoup

from .selector import Selector


class Schema(object):
    """
    Base class to inherit from for defining custom HTML schemas

    Selectors are declared as class attributes. Selectors declared on parent
    schemas are inherited; a subclass may override one by re-declaring the
    attribute, or disable it by assigning a non-Selector value (e.g. ``None``).

    :Example:

    .. code:: python

        class CustomSchema(Schema):
            # Parse the `<title>` element from the document
            title = Selector('title', required=True)

            # ... define other selectors here

        html = \"\"\"
        <html><head><title>My page title</title></head></html>
        \"\"\"
        parsed = CustomSchema.parse(html)
    """

    @classmethod
    def _get_selectors(cls):
        """
        Helper to get all the selectors defined on this Schema

        Walks the method resolution order from the most basic class to ``cls``
        so that selectors declared on parent schemas are included, and so that
        attributes re-declared on a subclass take precedence over inherited
        ones.

        :return: Iterator of ``(attribute name, selector)`` pairs
        """
        selectors = {}
        # Base classes first: later (more derived) classes overwrite entries.
        for klass in reversed(cls.__mro__):
            for name, value in vars(klass).items():
                if isinstance(value, Selector):
                    selectors[name] = value
                else:
                    # A subclass shadowing an inherited selector with a plain
                    # value means the attribute is no longer a selector.
                    selectors.pop(name, None)

        for name, value in selectors.items():
            yield name, value

    @classmethod
    def parse(cls, html):
        """
        Parse the provided html document into this schema.

        :param html: The text content of the HTML document to parse
        :type html: (str, bytes)
        :return: An instance of :class:`soup_schema.schema.Schema` which has had
                 its selectors parsed from ``html``
        :rtype: :class:`soup_schema.schema.Schema`
        :raises: :class:`soup_schema.error.ValidationError` if there was a
                 problem parsing a selector (e.g. one was required but none
                 was found)
        """
        instance = cls()
        soup = BeautifulSoup(html, 'html.parser')

        # Each selector's resolved value replaces the class-level Selector on
        # the instance under the same attribute name.
        for name, selector in cls._get_selectors():
            setattr(instance, name, selector.resolve(soup))

        return instance

    def __repr__(self):
        """Return ``ClassName(name=value, ...)`` listing every selector attribute."""
        properties = [
            '{name}={value}'.format(name=name, value=repr(getattr(self, name, None)))
            for name, _ in self.__class__._get_selectors()
        ]

        return (
            '{name}({properties})'
            .format(name=self.__class__.__name__, properties=', '.join(properties))
        )