Source code for metapub.base

from lxml import etree
from lxml_html_clean.clean import Cleaner

from .exceptions import MetaPubError, BaseXMLError






class MetaPubObject(object):
    """Base class for XML parsing objects (e.g. PubMedArticle).

    Instances keep two attributes, both set by ``__init__``:

    * ``xml``: the raw XML (str or bytes) the object was built from.
    * ``content``: the parsed element returned by :meth:`parse_xml`.
    """

    def __init__(self, xml, root=None, *args, **kwargs):
        """Instantiate with "xml" as string or bytes containing valid XML.

        Supply name of root element (string) to set virtual top level
        (optional).

        Args:
            xml (str or bytes): XML document to parse.
            root (str): (optional) name of the element to use as top level.
            *args: accepted for signature compatibility; not used here.
            **kwargs: accepted for signature compatibility; not used here.

        Raises:
            MetaPubError: if ``xml`` is falsy (None, '', b'', ...).
        """
        if not xml:
            # Report an empty string as the word "empty" so the message
            # stays readable; any other falsy value is shown as-is.
            shown = 'empty' if xml == '' else xml
            raise MetaPubError(
                'Cannot build MetaPubObject; xml string was %s' % shown)
        self.xml = xml
        self.content = self.parse_xml(xml, root)

    @staticmethod
    def parse_xml(xml, root=None):
        """Parse xml and optionally rebase the result on a named element.

        Takes xml (str or bytes) and (optionally) a root element definition
        string. If root element defined, DOM object returned is rebased with
        this element as root.

        Args:
            xml (str or bytes)
            root (str): (optional) name of root element

        Returns:
            lxml document object: the parsed top-level element, or the
            result of ``find(root)`` on it when ``root`` is given.
        """
        # str and bytes go through the same parser call, so no type
        # dispatch is needed here.
        dom = etree.XML(xml)
        if root:
            return dom.find(root)
        return dom

    def _get(self, tag):
        """Returns content of named XML element, or None if not found."""
        elem = self.content.find(tag)
        return self._extract_text(elem)

    def _clean_html(self, elem):
        """Removes HTML elements like i, b, and a.

        Serializes ``elem``, runs the markup through a ``Cleaner`` set up
        to remove ``a``, ``i``, ``b``, ``em`` and ``sup`` tags, then drops
        any literal ``<div>`` / ``</div>`` strings and surrounding
        whitespace from the cleaned output.

        Returns:
            str: the cleaned markup.
        """
        cleaner = Cleaner(remove_tags=['a', 'i', 'b', 'em', 'sup'])
        markup = etree.tostring(elem).decode("utf-8")
        cleaned = cleaner.clean_html(markup)
        return cleaned.replace("<div>", "").replace("</div>", "").strip()

    def _extract_text(self, elem):
        """Return the text of ``elem``, cleaning inline markup if present.

        Args:
            elem: an element, or None.

        Returns:
            None when ``elem`` is None; the output of :meth:`_clean_html`
            when ``elem`` has child elements; otherwise ``elem.text``.
        """
        if elem is None:
            return None
        # len() of an element is its number of direct children; this
        # replaces the deprecated getchildren() call.
        if len(elem):
            return self._clean_html(elem)
        return elem.text
# Shared-state ("Borg") class used by the fetchers: instances are distinct objects that share one attribute dict.
class Borg(object):
    """Shared-state class backing cache engine objects.

    Each instance is a separate object, but every instance uses the same
    attribute dictionary, so an attribute set on one is visible on all.
    """

    # The one dict that every instance adopts as its __dict__.
    _shared_state = {}

    def __init__(self):
        """Point this instance's attribute dict at the shared dict."""
        self.__dict__ = self._shared_state