from bs4 import BeautifulSoup
|
|
|
|
from .error import ValidationError
|
|
|
|
|
|
class Selector(object):
    """Extract a value from parsed markup using a CSS selector.

    The value of a matched element is its ``content`` attribute when it
    has one, otherwise its text.
    """

    def __init__(self, selector, required=False, as_list=False):
        """Store the selector configuration.

        :param selector: CSS selector handed to ``select`` / ``select_one``.
        :param required: when true, ``resolve`` raises ``ValidationError``
            if the resolved value is falsy.
        :param as_list: when true, ``resolve`` returns one value per
            matching element instead of only the first match.
        """
        self.selector = selector
        self.required = required
        self.as_list = as_list

    def _get_value(self, elm):
        """Return the value for a single matched element.

        :param elm: matched element, or ``None`` when nothing matched.
        :returns: ``None`` for a missing element, the ``content``
            attribute when present, otherwise the element text.
        """
        if elm is None:
            return None
        return elm.attrs['content'] if 'content' in elm.attrs else elm.text

    def resolve(self, soup):
        """Resolve this selector against ``soup``.

        :param soup: raw markup (``str`` / ``bytes``), which is parsed with
            the ``html.parser`` backend, or an object that already offers
            ``select`` and ``select_one``.
        :returns: a list of values when ``as_list`` is set, otherwise the
            value of the first match (``None`` if there is no match).
        :raises ValidationError: if ``required`` is set and the resolved
            value is falsy.
        """
        # Raw markup is parsed on demand; anything else is used as given.
        if isinstance(soup, (str, bytes)):
            soup = BeautifulSoup(soup, 'html.parser')

        if self.as_list:
            matches = soup.select(self.selector)
            result = [self._get_value(match) for match in matches]
        else:
            result = self._get_value(soup.select_one(self.selector))

        # Happy path: something was found, or nothing was demanded.
        if result or not self.required:
            return result

        message = (
            'Expected at least 1 element matching selector "{selector}", none was found'
            .format(selector=self.selector)
        )
        raise ValidationError(message)
|
|
|
|
|
|
class AttrSelector(Selector):
    """Selector whose value is a named attribute of the matched element."""

    def __init__(self, selector, attribute, *args, **kwargs):
        """Store the attribute name and delegate the rest to ``Selector``.

        :param selector: CSS selector forwarded to ``Selector.__init__``.
        :param attribute: name of the element attribute to read.
        :param args: extra positional arguments for ``Selector.__init__``.
        :param kwargs: extra keyword arguments for ``Selector.__init__``.
        """
        # ``selector`` must be forwarded positionally: passing it by keyword
        # alongside ``*args`` lets the first extra positional argument bind to
        # ``selector`` as well, which raises ``TypeError`` (multiple values).
        super(AttrSelector, self).__init__(selector, *args, **kwargs)
        self.attribute = attribute

    def _get_value(self, elm):
        """Return the configured attribute of ``elm``.

        :param elm: matched element, or ``None`` when nothing matched.
        :returns: ``None`` when ``elm`` is ``None`` or lacks the attribute,
            otherwise the attribute value.
        """
        if elm is None:
            return None
        return elm.attrs.get(self.attribute)
|
|
|
|
|
|
class SchemaSelector(Selector):
    """Selector that hands each matched element to a nested schema."""

    def __init__(self, selector, schema, *args, **kwargs):
        """Store the nested schema and delegate the rest to ``Selector``.

        :param selector: CSS selector forwarded to ``Selector.__init__``.
        :param schema: object exposing ``resolve(elm)``; it is called with
            each matched element.
        :param args: extra positional arguments for ``Selector.__init__``.
        :param kwargs: extra keyword arguments for ``Selector.__init__``.
        """
        # ``super`` must name this class: naming ``AttrSelector`` (which this
        # class does not inherit from) raises ``TypeError`` on instantiation.
        # ``selector`` goes positionally so it cannot collide with ``*args``.
        super(SchemaSelector, self).__init__(selector, *args, **kwargs)
        self.schema = schema

    def _get_value(self, elm):
        """Resolve the nested schema against ``elm``.

        :param elm: matched element, or ``None`` when nothing matched.
        :returns: ``None`` when ``elm`` is ``None``, otherwise whatever
            ``schema.resolve(elm)`` returns.
        """
        # Same guard as the sibling selectors: a missing element resolves to
        # ``None`` so the ``required`` check in ``resolve`` can report it,
        # instead of passing ``None`` into the schema.
        if elm is None:
            return None
        return self.schema.resolve(elm)
|
|
|
|
|
|
class AnySelector(Selector):
    """Try several selectors in order and use the first truthy result."""

    def __init__(self, selectors, required=False):
        """Store the candidate selectors.

        The base initializer is not invoked, so ``selector`` and ``as_list``
        are not set on instances of this class.

        :param selectors: iterable of objects exposing ``resolve(soup)``.
        :param required: when true, ``resolve`` raises ``ValidationError``
            if no candidate yields a truthy value.
        """
        self.required = required
        self.selectors = selectors

    def resolve(self, soup):
        """Return the first truthy value produced by the candidates.

        :param soup: passed unchanged to every candidate's ``resolve``.
        :returns: the first truthy candidate result, or ``None`` when no
            candidate succeeds and ``required`` is false.
        :raises ValidationError: if ``required`` is set and no candidate
            yields a truthy value.
        """
        for candidate in self.selectors:
            try:
                found = candidate.resolve(soup)
            except ValidationError:
                # DEV: A failing candidate is fine, move on to the next one
                continue
            if found:
                return found

        if not self.required:
            return None

        message = (
            'Expected at least 1 element matching selector "{selectors}", none was found'
            .format(selectors=self.selectors)
        )
        raise ValidationError(message)
|