#!/usr/bin/env python
"""Packaging script for the ``soup-schema`` distribution."""
import io
import os

from setuptools import find_packages, setup

# Resolve data files relative to this script so that building does not depend
# on the current working directory.
HERE = os.path.abspath(os.path.dirname(__file__))


def _read(name):
    """Return the text of *name*, a file located next to this script."""
    with io.open(os.path.join(HERE, name), encoding='utf-8') as fp:
        return fp.read()


# One requirement per line; blank lines and ``#`` comments are not
# requirements and must not reach ``install_requires``.
requirements = [
    line.strip()
    for line in _read('requirements.txt').splitlines()
    if line.strip() and not line.lstrip().startswith('#')
]

long_description = _read('README.rst')


setup(
    name='soup-schema',
    version='0.1.0',
    # ``soup_schema`` is a package directory (with a ``contrib`` subpackage),
    # not a single ``soup_schema.py`` module, so it has to be declared through
    # ``packages``; ``py_modules`` would ship no code at all.
    packages=find_packages(),
    install_requires=requirements,

    author='Brett Langdon',
    author_email='me@brett.is',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Topic :: Software Development',
    ],
    description='',
    license='MIT',
    long_description=long_description,
    keywords='beautifulsoup, soup, html, parser, schema',
    url='https://github.com/brettlangdon/soup_schema',
)
from ..schema import Schema
from ..selector import Selector, AnySelector, AttrSelector


class RecipeSchema(Schema):
    """Schema describing a recipe page marked up with ``itemprop`` attributes.

    Fields declared with ``required=True`` raise ``ValidationError`` while the
    schema is being resolved if none of their selectors produce a value; the
    other fields resolve to ``None`` (or an empty list for ``as_list`` fields).
    """

    # Publish data
    author = Selector('[itemprop=author]')
    categories = Selector('[itemprop=recipeCategory]', as_list=True)
    # Attribute values containing ``:`` are quoted: an unquoted CSS attribute
    # value has to be a plain identifier, which ``og:description`` is not.
    description = AnySelector([
        Selector('[itemprop=description]'),
        Selector('[name="og:description"]'),
        Selector('[name=description]'),
    ], required=True)
    name = AnySelector([
        Selector('[itemprop=name]'),
        Selector('[property="og:title"]'),
    ], required=True)
    recipe_yield = Selector('[itemprop=recipeYield]')

    # Recipe instructions
    # Both the ``recipeIngredient`` and ``ingredients`` property names are
    # tried, in that order.
    ingredients = AnySelector([
        Selector('[itemprop=recipeIngredient]', as_list=True),
        Selector('[itemprop=ingredients]', as_list=True),
    ], required=True)
    instructions = Selector(
        '[itemprop=recipeInstructions]', as_list=True, required=True
    )

    # Cooking time (read from the ``datetime`` attribute of the element)
    cook_time = AttrSelector('[itemprop=cookTime]', 'datetime')
    prep_time = AttrSelector('[itemprop=prepTime]', 'datetime')
    total_time = AttrSelector('[itemprop=totalTime]', 'datetime')

    # Nutrition
    calories = Selector('[itemprop=calories]')
    carbohydrate_content = Selector('[itemprop=carbohydrateContent]')
    cholesterol_content = Selector('[itemprop=cholesterolContent]')
    fat_content = Selector('[itemprop=fatContent]')
    protein_content = Selector('[itemprop=proteinContent]')
    sodium_content = Selector('[itemprop=sodiumContent]')
class ValidationError(Exception):
    """Raised when a required selector does not produce a value."""


def _to_soup(markup):
    """Parse *markup* with ``html.parser`` when it is raw ``str``/``bytes``.

    Anything else is assumed to already be a parsed tree (or element) and is
    returned unchanged.
    """
    if isinstance(markup, (str, bytes)):
        # Imported at the point of use: only raw markup needs the parser.
        from bs4 import BeautifulSoup
        return BeautifulSoup(markup, 'html.parser')
    return markup


class Selector(object):
    """Extract the value of the element(s) matching a CSS selector.

    :param selector: CSS selector handed to ``select`` / ``select_one``.
    :param required: raise :class:`ValidationError` when the resolved value is
        empty (``None``, ``''`` or ``[]``).
    :param as_list: resolve every matching element into a list instead of only
        the first match.
    """

    def __init__(self, selector, required=False, as_list=False):
        self.selector = selector
        self.as_list = as_list
        self.required = required

    def __repr__(self):
        return '{name}({selector!r})'.format(
            name=type(self).__name__, selector=self.selector
        )

    def _get_value(self, elm):
        """Return the ``content`` attribute of *elm* if present, else its text.

        ``None`` (no element matched) is passed through as ``None``.
        """
        if elm is None:
            return None
        if 'content' in elm.attrs:
            return elm.attrs['content']
        return elm.text

    def resolve(self, soup):
        """Resolve this selector against *soup*.

        :param soup: raw markup (``str``/``bytes``) or an already parsed tree.
        :returns: a single value, or a list of values when ``as_list`` is set.
        :raises ValidationError: when ``required`` is set and the value is
            empty.
        """
        soup = _to_soup(soup)
        if self.as_list:
            value = [self._get_value(elm) for elm in soup.select(self.selector)]
        else:
            value = self._get_value(soup.select_one(self.selector))

        if self.required and not value:
            raise ValidationError(
                'Expected at least 1 element matching selector "{selector}", none was found'
                .format(selector=self.selector)
            )
        return value


class AttrSelector(Selector):
    """Selector whose value is a named attribute of the matched element.

    :param selector: CSS selector of the element(s) to read.
    :param attribute: name of the attribute to return.

    Remaining arguments are forwarded to :class:`Selector`.
    """

    def __init__(self, selector, attribute, *args, **kwargs):
        # ``selector`` is forwarded positionally: combining ``selector=...``
        # with ``*args`` raises "multiple values for argument" as soon as a
        # positional ``required`` / ``as_list`` is supplied.
        super(AttrSelector, self).__init__(selector, *args, **kwargs)
        self.attribute = attribute

    def _get_value(self, elm):
        """Return the configured attribute of *elm*, or ``None`` if absent."""
        if elm is None:
            return None
        return elm.attrs.get(self.attribute)


class SchemaSelector(Selector):
    """Selector whose value is a nested schema resolved on the matched element.

    :param selector: CSS selector of the element(s) to hand to *schema*.
    :param schema: object exposing ``resolve(element)``, e.g. a
        :class:`Schema` subclass.

    Remaining arguments are forwarded to :class:`Selector`.
    """

    def __init__(self, selector, schema, *args, **kwargs):
        # ``super`` must name this class; naming ``AttrSelector`` here raises
        # ``TypeError`` because instances are not ``AttrSelector`` objects.
        super(SchemaSelector, self).__init__(selector, *args, **kwargs)
        self.schema = schema

    def _get_value(self, elm):
        """Resolve the nested schema on *elm*; ``None`` when nothing matched."""
        if elm is None:
            return None
        return self.schema.resolve(elm)


class AnySelector(Selector):
    """Try several selectors in order and return the first non-empty value.

    :param selectors: selectors to try, in priority order.
    :param required: raise :class:`ValidationError` when none of them yields
        a value.
    """

    def __init__(self, selectors, required=False):
        # There is no single CSS selector for this object; the base attributes
        # are still initialised so that every Selector has the same shape.
        super(AnySelector, self).__init__(selector=None, required=required)
        self.selectors = selectors

    def __repr__(self):
        return '{name}({selectors!r})'.format(
            name=type(self).__name__, selectors=self.selectors
        )

    def resolve(self, soup):
        """Return the first truthy value produced by the wrapped selectors.

        :returns: that value, or ``None`` when nothing matched and the
            selector is not required.
        :raises ValidationError: when ``required`` is set and nothing matched.
        """
        # Parse raw markup once instead of once per candidate selector.
        soup = _to_soup(soup)
        for selector in self.selectors:
            try:
                value = selector.resolve(soup)
            except ValidationError:
                # DEV: It is ok if one fails, we will try the next one
                continue
            if value:
                return value

        if self.required:
            raise ValidationError(
                'Expected at least 1 element matching selector "{selectors}", none was found'
                .format(selectors=self.selectors)
            )
        return None


class Schema(object):
    """Declarative container of :class:`Selector` class attributes.

    Resolving a schema returns an instance on which every declared selector
    name holds the value resolved from the document.
    """

    __version__ = 1

    @classmethod
    def _get_selectors(cls):
        """Yield ``(name, selector)`` for every selector declared on the class.

        The whole MRO is walked so selectors inherited from a parent schema
        are included; a name redefined lower in the hierarchy hides the
        inherited one.
        """
        seen = set()
        for klass in cls.__mro__:
            for name, value in vars(klass).items():
                if name in seen:
                    continue
                seen.add(name)
                if isinstance(value, Selector):
                    yield name, value

    @classmethod
    def resolve(cls, soup):
        """Build an instance from *soup*, raw markup or an already parsed tree.

        :raises ValidationError: when a required selector yields no value.
        """
        soup = _to_soup(soup)
        instance = cls()
        for name, selector in cls._get_selectors():
            setattr(instance, name, selector.resolve(soup))
        return instance

    @classmethod
    def parse(cls, html):
        """Parse *html* with ``html.parser`` and resolve the schema on it."""
        from bs4 import BeautifulSoup
        return cls.resolve(BeautifulSoup(html, 'html.parser'))

    def __repr__(self):
        properties = [
            '{name}={value}'.format(name=name, value=getattr(self, name, None))
            for name, _ in type(self)._get_selectors()
        ]
        return '{name}({properties})'.format(
            name=type(self).__name__, properties=', '.join(properties)
        )