brettlangdon
/
soup-schema
mirror of https://github.com/brettlangdon/soup-schema.git


								from bs4 import BeautifulSoup


								from .error import ValidationError


								class Selector(object):
								    """
								    Base selector class used to declare a property on a :class:`soup_schema.schema.Schema`.

								    A selector describes how the value of a single property is extracted from the HTML document.

								    :Example:

								    .. code:: python

								        class CustomSchema(Schema):
								            # Parse the `<title></title>` element from the document
								            title = Selector('title', required=True)
								            # ... define other selectors
								    """
								    def __init__(self, selector, required=False, as_list=False):
								        """
								        Create a new :class:`soup_schema.selector.Selector`.

								        .. seealso::
								          `BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_

								        :param selector: The CSS selector used to locate the element(s) in the HTML document.
								        :type selector: str
								        :param required: Whether :meth:`resolve` should raise when no (truthy) value could be parsed.
								        :type required: bool
								        :param as_list: Whether every matching element should be parsed into a list. By default only the first
								          element matching ``selector`` is parsed.
								        :type as_list: bool
								        """
								        self.selector, self.as_list, self.required = selector, as_list, required

								    def _get_value(self, elm):
								        """Internal method for extracting the value of one BeautifulSoup element (``None`` is passed through)"""
								        if elm is None:
								            return None
								        attributes = elm.attrs
								        # A ``content`` attribute wins over the element text when it is present.
								        return attributes['content'] if 'content' in attributes else elm.text

								    def resolve(self, soup):
								        """
								        Resolve this selector's value from the given HTML document (or BeautifulSoup element).

								        :param soup: HTML document content as a string/bytes, or an already parsed BeautifulSoup object
								        :type soup: :class:`bs4.BeautifulSoup`, :class:`bs4.element.Tag`, str, or bytes
								        :returns: The parsed value: a single value for the first match, a list when ``as_list is True``, or
								          ``None`` when nothing matched.
								        :rtype: str, list, None
								        :raises: :class:`soup_schema.error.ValidationError` if ``required is True`` and the parsed value is empty.
								        """
								        document = soup
								        if isinstance(document, (str, bytes)):
								            # Raw markup was handed in, parse it before selecting.
								            document = BeautifulSoup(document, 'html.parser')

								        if not self.as_list:
								            result = self._get_value(document.select_one(self.selector))
								        else:
								            result = list(map(self._get_value, document.select(self.selector)))

								        if not result and self.required:
								            template = 'Expected at least 1 element matching selector "{selector}", none was found'
								            raise ValidationError(template.format(selector=self.selector))
								        return result


								class AttrSelector(Selector):
								    """
								    Selector type which parses its value from an element attribute

								    :Example:

								    .. code:: python

								        class CustomSchema(Schema):
								            # Parse the `href` attribute from all links in the HTML document
								            hrefs = AttrSelector('a', 'href', as_list=True)
								            # ... define other selectors
								    """
								    def __init__(self, selector, attribute, *args, **kwargs):
								        """
								        Constructor for defining a new :class:`soup_schema.selector.AttrSelector`.

								        .. seealso::
								          `BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_

								        :param selector: The CSS selector to use for finding a given element in the HTML document.
								        :type selector: str
								        :param attribute: The name of the attribute to parse from the matching element
								        :type attribute: str
								        :param required: Whether or not an exception should be thrown if this selector could not be parsed.
								        :type required: bool
								        :param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
								          element matching the provided ``selector``
								        :type as_list: bool
								        """
								        # ``selector`` is forwarded positionally on purpose: ``selector=selector`` combined with ``*args`` binds
								        # the first extra positional argument to ``selector`` as well and raises
								        # ``TypeError: got multiple values for argument 'selector'``.
								        super(AttrSelector, self).__init__(selector, *args, **kwargs)
								        self.attribute = attribute

								    def _get_value(self, elm):
								        """Internal method for parsing an attribute from an element (``None`` when missing)"""
								        if elm is None:
								            return None
								        return elm.attrs.get(self.attribute)


								class SchemaSelector(Selector):
								    """
								    Selector type which uses a :class:`soup_schema.schema.Schema` to parse its value

								    :Example:

								    .. code:: python

								        example_html_doc = \"\"\"
								        <html>
								          <head></head>
								          <body>
								            <div class="review">
								              <div class="review__author">Author Name</div>
								              <div class="review__content">This review is awesome</div>
								            </div>
								            <div class="review">
								              <div class="review__author">Another reviewer</div>
								              <div class="review__content">This review is not as awesome as the last</div>
								            </div>
								          </body>
								        </html>
								        \"\"\"

								        class ReviewSchema(Schema):
								            author = Selector('.review__author', required=True)
								            review = Selector('.review__content', required=True)

								        class DocumentSchema(Schema):
								            # This selector will use `ReviewSchema` to parse each instance of `.review` in the document
								            reviews = SchemaSelector('.review', ReviewSchema, as_list=True)
								    """
								    def __init__(self, selector, schema, *args, **kwargs):
								        """
								        Constructor for defining a new :class:`soup_schema.selector.SchemaSelector`.

								        .. seealso::
								          `BeautifulSoup CSS Selectors <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`_

								        :param selector: The CSS selector to use for finding a given element in the HTML document.
								        :type selector: str
								        :param schema: The schema whose ``resolve`` is called with each matching element
								        :type schema: :class:`soup_schema.schema.Schema`
								        :param required: Whether or not an exception should be thrown if this selector could not be parsed.
								        :type required: bool
								        :param as_list: Whether this selector should be parsed as a list. Default behavior is to parse only the first
								          element matching the provided ``selector``
								        :type as_list: bool
								        """
								        # ``super`` must be given this class: ``super(AttrSelector, self)`` raises ``TypeError`` because a
								        # ``SchemaSelector`` is not an instance of ``AttrSelector``.
								        # ``selector`` is passed positionally so extra positional arguments in ``*args`` do not collide with it.
								        super(SchemaSelector, self).__init__(selector, *args, **kwargs)
								        self.schema = schema

								    def _get_value(self, elm):
								        """Internal method for parsing a Schema from an element (``None`` when no element matched)"""
								        if elm is None:
								            # Same contract as the other selectors: no element means no value.
								            # NOTE(review): assumes ``Schema.resolve`` cannot do anything useful with ``None`` - confirm.
								            return None
								        return self.schema.resolve(elm)


								class AnySelector(Selector):
								    """
								    Selector type which is used as a boolean "or" for parsing an element's value.

								    This selector type is useful when you want to be able to search multiple locations for a property's value.

								    :Example:

								    .. code:: python

								        example_html_doc = \"\"\"
								        <html>
								          <head>
								            <meta name="description" content="My description" />
								            <meta name="og:description" content="My description" />
								          </head>
								          <body></body>
								        </html>
								        \"\"\"

								        class CustomSchema(Schema):
								            # - Try to parse the `<meta name="description" />` element
								            # - if that was not found, then try to parse the `<meta name="og:description" />` element
								            # - if that was also not found, then raise an exception (because of `required=True`)
								            description = AnySelector([
								                Selector('[name=description]'),
								                # The value is quoted because `og:description` is not a valid unquoted CSS identifier
								                Selector('[name="og:description"]'),
								            ], required=True)
								    """
								    def __init__(self, selectors, required=False):
								        """
								        Constructor for defining a new :class:`AnySelector`.

								        :param selectors: The :class:`soup_schema.selector.Selector` instances to try, in order, when searching
								          for this property's value
								        :type selectors: list of :class:`soup_schema.selector.Selector`
								        :param required: Whether or not an exception should be thrown if this selector could not be parsed.
								        :type required: bool
								        """
								        # The base constructor is not called: this selector has no CSS selector of its own, it only delegates
								        # to the wrapped selectors.
								        self.selectors = selectors
								        self.required = required

								    def resolve(self, soup):
								        """
								        Resolve the value for this selector from the provided HTML document (or BeautifulSoup element).

								        .. seealso:: :meth:`soup_schema.selector.Selector.resolve`

								        :param soup: HTML document content as a string or BeautifulSoup object to parse this selector from
								        :type soup: :class:`bs4.BeautifulSoup`, :class:`bs4.element.Tag`, str, or bytes
								        :returns: The first truthy value produced by one of ``selectors``, or ``None`` if none produced one.
								        :rtype: str, list, None
								        :raises: :class:`soup_schema.error.ValidationError` if ``required is True`` and no selector produced a value.
								        """
								        for candidate in self.selectors:
								            try:
								                value = candidate.resolve(soup)
								            except ValidationError:
								                # DEV: It is ok if one fails, we will try the next one
								                continue
								            if value:
								                return value

								        if self.required:
								            # Report the wrapped CSS selectors rather than the repr of the selector objects; objects without
								            # a ``selector`` attribute (e.g. a nested ``AnySelector``) fall back to their ``str()``.
								            described = ', '.join(
								                str(getattr(candidate, 'selector', candidate)) for candidate in self.selectors
								            )
								            raise ValidationError(
								                'Expected at least 1 element matching selector "{selectors}", none was found'
								                .format(selectors=described)
								            )
								        return None