Browse Source

Initial prototype

master
Brett Langdon 9 years ago
commit
faea698c1c
No known key found for this signature in database GPG Key ID: A2ECAB73CE12147F
12 changed files with 205 additions and 0 deletions
  1. +4
    -0
      .gitignore
  2. +4
    -0
      MANIFEST.in
  3. +2
    -0
      README.rst
  4. +1
    -0
      requirements.txt
  5. +36
    -0
      setup.py
  6. +11
    -0
      soup_schema/__init__.py
  7. +0
    -0
      soup_schema/contrib/__init__.py
  8. +38
    -0
      soup_schema/contrib/recipe.py
  9. +2
    -0
      soup_schema/error.py
  10. +31
    -0
      soup_schema/schema.py
  11. +76
    -0
      soup_schema/selector.py
  12. +0
    -0
      soup_schema/test/__init__.py

+ 4
- 0
.gitignore View File

@ -0,0 +1,4 @@
*.py[co]
*.egg-info
dist/
build/

+ 4
- 0
MANIFEST.in View File

@ -0,0 +1,4 @@
include README.* setup.py requirements.txt
recursive-include soup_schema *.py
global-exclude *.pyc
global-exclude *.pyo

+ 2
- 0
README.rst View File

@ -0,0 +1,2 @@
soup-schema
~~~~~~~~~~~

+ 1
- 0
requirements.txt View File

@ -0,0 +1 @@
beautifulsoup4==4.5.1

+ 36
- 0
setup.py View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
from setuptools import setup
# Read the pinned runtime dependencies. Blank lines and comment lines are
# skipped so they never reach ``install_requires`` as empty requirements.
with open('./requirements.txt', 'r') as fp:
    requirements = [
        line.strip()
        for line in fp
        if line.strip() and not line.lstrip().startswith('#')
    ]

# The README is used verbatim as the long description.
with open('./README.rst', 'r') as fp:
    long_description = fp.read()

setup(
    name='soup-schema',
    version='0.1.0',
    # ``soup_schema`` is a package directory with sub-packages, not a
    # single-file module. Declaring it through ``py_modules`` makes setuptools
    # look for ``soup_schema.py`` and ship nothing, so list the packages.
    packages=[
        'soup_schema',
        'soup_schema.contrib',
        'soup_schema.test',
    ],
    install_requires=requirements,
    author='Brett Langdon',
    author_email='me@brett.is',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Topic :: Software Development',
    ],
    description='',
    license='MIT',
    long_description=long_description,
    keywords='beautifulsoup, soup, html, parser, schema',
    url='https://github.com/brettlangdon/soup_schema',
)

+ 11
- 0
soup_schema/__init__.py View File

@ -0,0 +1,11 @@
from .error import ValidationError
from .schema import Schema
from .selector import Selector, AnySelector, AttrSelector
# Names exported by ``from soup_schema import *``; each one is imported from
# a submodule at the top of this file.
__all__ = [
    'AnySelector',
    'AttrSelector',
    'Schema',
    'Selector',
    'ValidationError',
]

+ 0
- 0
soup_schema/contrib/__init__.py View File


+ 38
- 0
soup_schema/contrib/recipe.py View File

@ -0,0 +1,38 @@
from ..schema import Schema
from ..selector import Selector, AnySelector, AttrSelector
class RecipeSchema(Schema):
    """Schema describing the recipe fields to extract from an HTML page.

    Most fields are located through ``itemprop`` attributes; ``description``
    and ``name`` additionally fall back to ``<meta>``-style attributes.
    Fields declared with ``required=True`` raise ``ValidationError`` during
    parsing when nothing matches.
    """

    # Publish data
    author = Selector('[itemprop=author]')
    categories = Selector('[itemprop=recipeCategory]', as_list=True)
    # Attribute values containing ``:`` are quoted: an unquoted value must be
    # a plain CSS identifier, and stricter selector engines reject the colon.
    description = AnySelector([
        Selector('[itemprop=description]'),
        Selector('[name="og:description"]'),
        Selector('[name=description]'),
    ], required=True)
    name = AnySelector([
        Selector('[itemprop=name]'),
        Selector('[property="og:title"]'),
    ], required=True)
    recipe_yield = Selector('[itemprop=recipeYield]')

    # Recipe instructions
    # Two itemprop spellings are tried in order; the first non-empty list wins.
    ingredients = AnySelector([
        Selector('[itemprop=recipeIngredient]', as_list=True),
        Selector('[itemprop=ingredients]', as_list=True),
    ], required=True)
    instructions = Selector(
        '[itemprop=recipeInstructions]', as_list=True, required=True
    )

    # Cooking time: read from the element's ``datetime`` attribute rather
    # than from its text.
    cook_time = AttrSelector('[itemprop=cookTime]', 'datetime')
    prep_time = AttrSelector('[itemprop=prepTime]', 'datetime')
    total_time = AttrSelector('[itemprop=totalTime]', 'datetime')

    # Nutrition
    calories = Selector('[itemprop=calories]')
    carbohydrate_content = Selector('[itemprop=carbohydrateContent]')
    cholesterol_content = Selector('[itemprop=cholesterolContent]')
    fat_content = Selector('[itemprop=fatContent]')
    protein_content = Selector('[itemprop=proteinContent]')
    sodium_content = Selector('[itemprop=sodiumContent]')

+ 2
- 0
soup_schema/error.py View File

@ -0,0 +1,2 @@
class ValidationError(Exception):
    """Raised when a required selector finds no matching value."""

+ 31
- 0
soup_schema/schema.py View File

@ -0,0 +1,31 @@
from bs4 import BeautifulSoup
from .selector import Selector
class Schema(object):
    """Declarative base class mapping attribute names to selectors.

    Subclasses declare ``Selector`` instances as class attributes. ``parse``
    resolves each of them against a document and stores the result on a new
    instance under the same attribute name.
    """

    __version__ = 1

    @classmethod
    def _get_selectors(cls):
        """Yield ``(name, selector)`` for every selector declared on the class.

        The whole MRO is walked so that selectors declared on a parent schema
        are inherited by subclasses. A name redefined lower in the hierarchy
        shadows the parent's definition, even when the new value is not a
        selector.
        """
        seen = set()
        for klass in cls.__mro__:
            for name, value in vars(klass).items():
                if name in seen:
                    # Already decided by a more derived class.
                    continue
                seen.add(name)
                if isinstance(value, Selector):
                    yield name, value

    @classmethod
    def parse(cls, html):
        """Parse ``html`` and return an instance populated from the selectors.

        :param html: markup handed to ``BeautifulSoup`` with ``html.parser``.
        :returns: a new instance of ``cls`` with one attribute per selector.
        :raises ValidationError: propagated from a selector's ``resolve``.
        """
        instance = cls()
        soup = BeautifulSoup(html, 'html.parser')
        for name, value in cls._get_selectors():
            # The instance attribute shadows the class-level selector.
            setattr(instance, name, value.resolve(soup))
        return instance

    def __repr__(self):
        """Return ``ClassName(field=value, ...)`` covering every selector."""
        properties = []
        for name, _ in self.__class__._get_selectors():
            value = getattr(self, name, None)
            properties.append('{name}={value}'.format(name=name, value=value))
        return (
            '{name}({properties})'
            .format(name=self.__class__.__name__, properties=', '.join(properties))
        )

+ 76
- 0
soup_schema/selector.py View File

@ -0,0 +1,76 @@
from bs4 import BeautifulSoup
from .error import ValidationError
class Selector(object):
    """Extract a value, or a list of values, matched by a CSS selector.

    :param selector: CSS selector string passed to the soup's select methods.
    :param required: when true, an empty result raises ``ValidationError``.
    :param as_list: when true, every match is returned instead of the first.
    """

    def __init__(self, selector, required=False, as_list=False):
        self.selector, self.required, self.as_list = selector, required, as_list

    def _get_value(self, elm):
        """Return the ``content`` attribute of ``elm`` if set, else its text.

        ``None`` is passed straight through so a missing element yields
        ``None``.
        """
        if elm is None:
            return None
        attributes = elm.attrs
        return attributes['content'] if 'content' in attributes else elm.text

    def resolve(self, soup):
        """Run the selector against ``soup`` and return the extracted value.

        :param soup: a parsed document, or ``str``/``bytes`` markup which is
            parsed with ``html.parser`` first.
        :returns: a list of values when ``as_list`` is set, otherwise the
            value of the first match (``None`` when nothing matched).
        :raises ValidationError: if the result is empty and ``required`` is
            set.
        """
        document = soup
        if isinstance(document, (str, bytes)):
            document = BeautifulSoup(document, 'html.parser')

        if self.as_list:
            matches = document.select(self.selector)
            value = list(map(self._get_value, matches))
        else:
            value = self._get_value(document.select_one(self.selector))

        if value or not self.required:
            return value

        raise ValidationError(
            'Expected at least 1 element matching selector "{selector}", none was found'
            .format(selector=self.selector)
        )
class AttrSelector(Selector):
    """Selector that extracts a named attribute instead of element text.

    :param selector: CSS selector string.
    :param attribute: name of the attribute to read from each matched element.
    :param args: extra positional arguments forwarded to ``Selector``.
    :param kwargs: extra keyword arguments forwarded to ``Selector``.
    """

    def __init__(self, selector, attribute, *args, **kwargs):
        # ``selector`` is passed positionally: combining ``selector=selector``
        # with ``*args`` raised "multiple values for argument 'selector'" as
        # soon as a caller supplied any extra positional argument.
        super(AttrSelector, self).__init__(selector, *args, **kwargs)
        self.attribute = attribute

    def _get_value(self, elm):
        """Return the configured attribute of ``elm``, or ``None`` if absent."""
        if elm is None:
            return None
        return elm.attrs.get(self.attribute)
class SchemaSelector(Selector):
    """Selector that resolves each matched element through a nested schema.

    :param selector: CSS selector string locating the nested element(s).
    :param schema: object used to turn a matched element into a value. Its
        ``resolve(elm)`` is used when it has one; otherwise its
        ``parse(markup)`` is called with the element's markup.
    :param args: extra positional arguments forwarded to ``Selector``.
    :param kwargs: extra keyword arguments forwarded to ``Selector``.
    """

    def __init__(self, selector, schema, *args, **kwargs):
        # ``super`` must name this class: ``super(AttrSelector, self)`` raised
        # TypeError on every instantiation because a SchemaSelector is not an
        # AttrSelector. ``selector`` is positional so it cannot collide with
        # ``*args``.
        super(SchemaSelector, self).__init__(selector, *args, **kwargs)
        self.schema = schema

    def _get_value(self, elm):
        """Return the nested schema's result for ``elm``, or ``None``."""
        if elm is None:
            # No match: report "nothing found" like the sibling selectors do.
            return None
        resolve = getattr(self.schema, 'resolve', None)
        if resolve is not None:
            return resolve(elm)
        # No ``resolve`` on the schema: re-parse the element's own markup
        # through the ``parse(markup)`` entry point instead.
        return self.schema.parse(str(elm))
class AnySelector(Selector):
    """Try several selectors in order and return the first non-empty result.

    :param selectors: selectors to try, in priority order.
    :param required: when true, raise ``ValidationError`` if none of the
        selectors produced a value.
    """

    def __init__(self, selectors, required=False):
        self.selectors = selectors
        self.required = required

    def _describe(self):
        """Return a readable, comma-separated list of the wrapped selectors."""
        parts = []
        for selector in self.selectors:
            if isinstance(selector, AnySelector):
                parts.append(selector._describe())
            else:
                parts.append(str(getattr(selector, 'selector', selector)))
        return ', '.join(parts)

    def resolve(self, soup):
        """Return the first truthy value produced by the wrapped selectors.

        :param soup: a parsed document, or ``str``/``bytes`` markup.
        :returns: the first non-empty value, or ``None`` when nothing matched
            and the selector is not required.
        :raises ValidationError: if nothing matched and ``required`` is set.
        """
        # Parse raw markup once here so each alternative does not re-parse it.
        if isinstance(soup, (str, bytes)):
            soup = BeautifulSoup(soup, 'html.parser')

        for selector in self.selectors:
            try:
                value = selector.resolve(soup)
            except ValidationError:
                # DEV: It is ok if one fails, we will try the next one
                continue
            if value:
                return value

        if self.required:
            # Report the CSS selectors themselves, not the selector objects'
            # default reprs, so the message says what was searched for.
            raise ValidationError(
                'Expected at least 1 element matching selector "{selectors}", none was found'
                .format(selectors=self._describe())
            )
        return None

+ 0
- 0
soup_schema/test/__init__.py View File


Loading…
Cancel
Save