| @ -0,0 +1,4 @@ | |||||
| *.py[co] | |||||
| *.egg-info | |||||
| dist/ | |||||
| build/ | |||||
| @ -0,0 +1,4 @@ | |||||
| include README.* setup.py requirements.txt | |||||
| recursive-include soup_schema *.py | |||||
| global-exclude *.pyc | |||||
| global-exclude *.pyo | |||||
| @ -0,0 +1,2 @@ | |||||
| soup-schema | |||||
| ~~~~~~~~~~~ | |||||
| @ -0,0 +1 @@ | |||||
| beautifulsoup4==4.5.1 | |||||
| @ -0,0 +1,36 @@ | |||||
#!/usr/bin/env python
"""Packaging script for the ``soup-schema`` distribution."""
from setuptools import find_packages, setup

# Runtime dependencies live in requirements.txt so there is a single list to
# maintain. Blank lines and "#" comments are not valid requirement
# specifiers, so they are dropped before being handed to setuptools.
with open('./requirements.txt', 'r') as fp:
    requirements = [
        line.strip()
        for line in fp
        if line.strip() and not line.strip().startswith('#')
    ]

# The README doubles as the long description shown on the package index.
with open('./README.rst', 'r') as fp:
    long_description = fp.read()

setup(
    name='soup-schema',
    version='0.1.0',
    # ``soup_schema`` is a package (a directory of modules that use relative
    # imports), not a single ``soup_schema.py`` file, so it has to be
    # declared through ``packages``; ``py_modules`` would ship nothing.
    packages=find_packages(include=['soup_schema', 'soup_schema.*']),
    install_requires=requirements,
    author='Brett Langdon',
    author_email='me@brett.is',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Topic :: Software Development',
    ],
    description='',
    license='MIT',
    long_description=long_description,
    keywords='beautifulsoup, soup, html, parser, schema',
    url='https://github.com/brettlangdon/soup_schema',
)
| @ -0,0 +1,11 @@ | |||||
| from .error import ValidationError | |||||
| from .schema import Schema | |||||
| from .selector import Selector, AnySelector, AttrSelector | |||||
# Names re-exported as the package's public API (alphabetical order).
__all__ = ['AnySelector', 'AttrSelector', 'Schema', 'Selector', 'ValidationError']
| @ -0,0 +1,38 @@ | |||||
| from ..schema import Schema | |||||
| from ..selector import Selector, AnySelector, AttrSelector | |||||
class RecipeSchema(Schema):
    """Schema that extracts recipe fields from an HTML page.

    Most fields are located through their ``itemprop`` attribute; ``name``
    and ``description`` additionally fall back to ``og:`` / ``description``
    meta tags. ``description``, ``name``, ``ingredients`` and
    ``instructions`` are required, every other field is optional.

    Attribute values that contain ``:`` are quoted: an unquoted ``og:title``
    is not a valid CSS identifier and strict selector engines reject it,
    while the quoted form is accepted by lenient and strict engines alike.
    """

    # Publish data
    author = Selector('[itemprop=author]')
    categories = Selector('[itemprop=recipeCategory]', as_list=True)
    description = AnySelector([
        Selector('[itemprop=description]'),
        Selector('[name="og:description"]'),
        Selector('[name=description]'),
    ], required=True)
    name = AnySelector([
        Selector('[itemprop=name]'),
        Selector('[property="og:title"]'),
    ], required=True)
    recipe_yield = Selector('[itemprop=recipeYield]')

    # Recipe instructions
    ingredients = AnySelector([
        Selector('[itemprop=recipeIngredient]', as_list=True),
        Selector('[itemprop=ingredients]', as_list=True),
    ], required=True)
    instructions = Selector(
        '[itemprop=recipeInstructions]',
        as_list=True,
        required=True,
    )

    # Cooking time (read from each element's ``datetime`` attribute)
    cook_time = AttrSelector('[itemprop=cookTime]', 'datetime')
    prep_time = AttrSelector('[itemprop=prepTime]', 'datetime')
    total_time = AttrSelector('[itemprop=totalTime]', 'datetime')

    # Nutrition
    calories = Selector('[itemprop=calories]')
    carbohydrate_content = Selector('[itemprop=carbohydrateContent]')
    cholesterol_content = Selector('[itemprop=cholesterolContent]')
    fat_content = Selector('[itemprop=fatContent]')
    protein_content = Selector('[itemprop=proteinContent]')
    sodium_content = Selector('[itemprop=sodiumContent]')
| @ -0,0 +1,2 @@ | |||||
class ValidationError(Exception):
    """Raised when a required selector does not produce a value."""
| @ -0,0 +1,31 @@ | |||||
| from bs4 import BeautifulSoup | |||||
| from .selector import Selector | |||||
class Schema(object):
    """Declarative description of the values to extract from a document.

    Subclasses declare fields as class attributes holding ``Selector``
    instances. Resolving a schema creates an instance of the subclass and
    replaces each field, on that instance, with the value its selector
    resolved to.
    """

    __version__ = 1

    @classmethod
    def _get_selectors(cls):
        """Yield ``(name, selector)`` for every field declared on the schema.

        The whole MRO is walked, most-derived class first, so fields
        inherited from a parent schema are included. A name is reported at
        most once: an attribute redefined on a subclass hides the parent's
        definition, even when the new value is not a ``Selector``.
        """
        seen = set()
        for klass in cls.__mro__:
            for name, value in vars(klass).items():
                if name in seen:
                    continue
                seen.add(name)
                if isinstance(value, Selector):
                    yield name, value

    @classmethod
    def resolve(cls, soup):
        """Build a schema instance from already-parsed markup.

        :param soup: an object offering ``select`` / ``select_one`` (for
            example a ``BeautifulSoup`` document or one of its elements), or
            raw ``str`` / ``bytes`` markup, which is parsed first.
        :returns: a new instance of ``cls`` with one attribute per field.
        :raises ValidationError: propagated from a required selector that
            found no value.
        """
        if isinstance(soup, (str, bytes)):
            soup = BeautifulSoup(soup, 'html.parser')

        instance = cls()
        for name, selector in cls._get_selectors():
            setattr(instance, name, selector.resolve(soup))
        return instance

    @classmethod
    def parse(cls, html):
        """Parse ``html`` with ``html.parser`` and resolve every field.

        :param html: markup accepted by ``BeautifulSoup``.
        :returns: a new instance of ``cls`` with one attribute per field.
        :raises ValidationError: propagated from a required selector that
            found no value.
        """
        return cls.resolve(BeautifulSoup(html, 'html.parser'))

    def __repr__(self):
        """Return ``ClassName(field=value, ...)`` covering every field."""
        properties = [
            '{name}={value}'.format(name=name, value=getattr(self, name, None))
            for name, _ in self.__class__._get_selectors()
        ]
        return (
            '{name}({properties})'
            .format(name=self.__class__.__name__, properties=', '.join(properties))
        )
| @ -0,0 +1,76 @@ | |||||
| from bs4 import BeautifulSoup | |||||
| from .error import ValidationError | |||||
class Selector(object):
    """Extract a value from the element(s) matching a CSS selector.

    :param selector: CSS selector string handed to ``select`` / ``select_one``.
    :param required: when true, an empty result raises ``ValidationError``.
    :param as_list: when true, return one value per matching element instead
        of only the first match.
    """

    def __init__(self, selector, required=False, as_list=False):
        self.selector = selector
        self.required = required
        self.as_list = as_list

    def _get_value(self, elm):
        """Return the ``content`` attribute of ``elm`` if present, else its text.

        ``None`` (no matching element) is passed through unchanged.
        """
        if elm is None:
            return None
        return elm.attrs['content'] if 'content' in elm.attrs else elm.text

    def resolve(self, soup):
        """Run the selector against ``soup`` and return the extracted value.

        :param soup: parsed markup, or raw ``str`` / ``bytes`` that is parsed
            with ``html.parser`` first.
        :returns: a list of values when ``as_list`` is set, otherwise the
            value of the first match (``None`` when nothing matched).
        :raises ValidationError: if the selector is required and the result
            is empty.
        """
        if isinstance(soup, (str, bytes)):
            soup = BeautifulSoup(soup, 'html.parser')

        if self.as_list:
            matches = soup.select(self.selector)
            value = [self._get_value(match) for match in matches]
        else:
            value = self._get_value(soup.select_one(self.selector))

        # Anything truthy, or any result for an optional selector, is fine.
        if value or not self.required:
            return value

        message = (
            'Expected at least 1 element matching selector "{selector}", none was found'
            .format(selector=self.selector)
        )
        raise ValidationError(message)
class AttrSelector(Selector):
    """Selector that reads a named attribute instead of the element's text.

    :param selector: CSS selector string.
    :param attribute: name of the attribute whose value is returned.
    :param args: extra positional arguments forwarded to ``Selector``.
    :param kwargs: extra keyword arguments forwarded to ``Selector``.
    """

    def __init__(self, selector, attribute, *args, **kwargs):
        # ``selector`` is forwarded positionally: passing it by keyword next
        # to ``*args`` makes any extra positional argument collide with it
        # ("got multiple values for argument 'selector'").
        super(AttrSelector, self).__init__(selector, *args, **kwargs)
        self.attribute = attribute

    def _get_value(self, elm):
        """Return ``elm``'s configured attribute, or ``None`` if unavailable.

        ``None`` is returned both when there is no element and when the
        element does not carry the attribute.
        """
        if elm is None:
            return None
        return elm.attrs.get(self.attribute)
class SchemaSelector(Selector):
    """Selector that hands each matched element to a nested schema.

    :param selector: CSS selector string.
    :param schema: object exposing ``resolve(element)``; it is called once
        per matched element and its return value becomes the field value.
    :param args: extra positional arguments forwarded to ``Selector``.
    :param kwargs: extra keyword arguments forwarded to ``Selector``.
    """

    def __init__(self, selector, schema, *args, **kwargs):
        # ``super`` must name this class; naming an unrelated sibling class
        # raises TypeError as soon as an instance is created. ``selector``
        # is forwarded positionally so extra positional arguments cannot
        # collide with it.
        super(SchemaSelector, self).__init__(selector, *args, **kwargs)
        self.schema = schema

    def _get_value(self, elm):
        """Resolve the nested schema against ``elm``.

        Returns ``None`` when there is no element, so the regular
        ``required`` check reports the miss instead of the nested schema
        failing on ``None``.
        """
        if elm is None:
            return None
        return self.schema.resolve(elm)
class AnySelector(Selector):
    """Try several selectors in order and return the first non-empty value.

    :param selectors: iterable of selector objects, tried in order.
    :param required: when true, ``ValidationError`` is raised if none of the
        selectors produced a value.
    """

    def __init__(self, selectors, required=False):
        self.selectors = selectors
        self.required = required

    def _describe(self):
        """Return the wrapped CSS selectors as one comma separated string."""
        parts = []
        for selector in self.selectors:
            if isinstance(selector, AnySelector):
                parts.append(selector._describe())
            else:
                # Fall back to the object itself if it has no CSS string.
                parts.append(str(getattr(selector, 'selector', selector)))
        return ', '.join(parts)

    def resolve(self, soup):
        """Return the first truthy value produced by the wrapped selectors.

        :param soup: parsed markup (or raw markup) passed to each selector.
        :returns: the first truthy value, or ``None`` when nothing matched
            and the selector is optional.
        :raises ValidationError: if ``required`` is set and every wrapped
            selector came back empty or failed validation.
        """
        for selector in self.selectors:
            try:
                value = selector.resolve(soup)
            except ValidationError:
                # DEV: It is ok if one fails, we will try the next one
                continue
            if value:
                return value

        if self.required:
            # Report the CSS strings rather than the selector objects, whose
            # default repr is only a class name and a memory address.
            raise ValidationError(
                'Expected at least 1 element matching selector "{selectors}", none was found'
                .format(selectors=self._describe())
            )
        return None