# Source code for metapub.dx_doi

import logging
import requests
import certifi
from requests.adapters import HTTPAdapter
import urllib3
from urllib3.util import Retry

from .cache_utils import SQLiteCache, get_cache_path
from .base import Borg
from .config import DEFAULT_CACHE_DIR
from .exceptions import BadDOI, DxDOIError
from .text_mining import find_doi_in_string

# URL template for the DOI resolver; ``%s`` is replaced by the DOI being
# looked up.
DX_DOI_URL = 'http://dx.doi.org/%s'
# File name of the cache database, combined with the cache directory via
# get_cache_path().
CACHE_FILENAME = 'dx_doi-cache.db'

# Module-level cache handle; stays None until _get_dx_doi_cache() creates it.
DX_DOI_CACHE = None

def _get_dx_doi_cache(cachedir=DEFAULT_CACHE_DIR):
    """Return the module-wide DOI cache, creating it on first use.

    The cache object is kept in the module global ``DX_DOI_CACHE``.  Once it
    exists, later calls return that same object and ``cachedir`` is ignored.

    :param cachedir: directory passed to ``get_cache_path`` together with
        ``CACHE_FILENAME`` to locate the cache database.
    :return: the shared ``SQLiteCache`` instance.
    """
    global DX_DOI_CACHE
    # Compare against None rather than testing truthiness: an existing cache
    # object that happens to evaluate as false (for example a container
    # reporting zero length) must be reused, not silently replaced.
    if DX_DOI_CACHE is None:
        DX_DOI_CACHE = SQLiteCache(get_cache_path(cachedir, CACHE_FILENAME))
    return DX_DOI_CACHE



class DxDOI(Borg):
    """ Looks up DOIs in dx.doi.org and caches results in an SQLite
    cache. This is a Borg singleton object.

    Methods:

        resolve (doi, check_doi=True, whitespace=False, skip_cache=False):
            uses supplied doi to get link to publisher; the cache is
            consulted first unless skip_cache is True.

        check_doi (doi, whitespace=False): returns doi if supplied DOI is
            good, raises BadDOI if not good.
    """


    def __init__(self, retries=1, **kwargs):
        """Set up the logger, the retry count and the result cache.

        :param retries: (int) stored as ``self.retries``; used as the total
            retry count when an HTTP session is created.
        :param cachedir: (keyword, optional) cache directory handed to
            ``_get_dx_doi_cache``; defaults to ``DEFAULT_CACHE_DIR``.  Any
            other keyword arguments are ignored.
        """
        logger = logging.getLogger('metapub.DxDOI')
        logger.setLevel(logging.INFO)
        self._log = logger

        self.retries = retries
        self._cache = _get_dx_doi_cache(kwargs.get('cachedir', DEFAULT_CACHE_DIR))


    def _create_session(self):
        """Build a ``requests.Session`` configured for DOI lookups.

        The same retrying adapter is mounted for both ``https://`` and
        ``http://``.  It retries HEAD/GET/OPTIONS requests up to
        ``self.retries`` times on status 429, 500, 502, 503 or 504, and the
        session sends browser-like default headers.

        :return: a new ``requests.Session``; it is not closed here.
        """
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'http://dx.doi.org'
        }

        retry_policy = Retry(
            total=self.retries,
            backoff_factor=0.1,  # keep the pause between attempts short
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
            raise_on_status=False,
        )
        http_adapter = HTTPAdapter(max_retries=retry_policy)

        sess = requests.Session()
        for scheme in ("https://", "http://"):
            sess.mount(scheme, http_adapter)
        sess.headers.update(browser_headers)
        return sess


    def check_doi(self, doi, whitespace=False):
        """ Checks validity of supplied doi.

        If whitespace is True (default False), allows supplied doi to
        contain whitespace.

        :param doi: (str) candidate DOI
        :param whitespace: (bool) forwarded to ``find_doi_in_string``
        :return: doi (str) -- the supplied DOI, returned unchanged once it
            has passed the check
        :raises BadDOI: if supplied DOI fails regular expression check.
        """
        # Forward the caller's flag so that whitespace=True actually relaxes
        # the check instead of always running the strict variant.
        if find_doi_in_string(doi, whitespace=whitespace) is None:
            raise BadDOI('Supplied DOI "%s" fails doi check' % doi)
        return doi


    def _query_api(self, doi):
        """Resolve ``doi`` over HTTP and return the URL of the final response.

        Issues a GET for ``DX_DOI_URL % doi`` (following redirects) on a
        fresh session, which is always closed before returning.  On success
        the URL is stored in the cache, when one is configured, under
        ``self._make_cache_key(doi)``.

        Error responses with status 402, 403, 408 or 429 are tolerated: the
        URL of that response is still cached and returned.

        :param doi: (str) DOI to resolve
        :return: (str) URL of the final response
        :raises DxDOIError: on connection problems, on any other HTTP error
            status, or on a non-error status that is not in the accepted
            list.
        """
        # Statuses accepted after raise_for_status() has passed (so < 400).
        accepted = (200, 202, 301, 302, 307, 308)
        # Error statuses for which the response URL is still returned.
        tolerated_errors = (402, 403, 408, 429)

        session = self._create_session()
        response = None
        try:
            response = session.get(DX_DOI_URL % doi, allow_redirects=True,
                                   verify=certifi.where(), timeout=5)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # ``response`` is still None when the request itself failed
            # (no connection, timeout, ...), as opposed to an HTTP error.
            if response is None or response.status_code not in tolerated_errors:
                if isinstance(e, requests.exceptions.ConnectionError):
                    self._log.error(f'Connection error for URL: {DX_DOI_URL % doi}')
                raise DxDOIError(f'Error processing DOI {doi}: {str(e)}') from e
            self._log.info(f'URL returned status code {response.status_code}: {response.url}')
        else:
            # Fail loudly on an unrecognised status instead of falling
            # through and handing ``None`` back to the caller.
            if response.status_code not in accepted:
                raise DxDOIError(
                    f'Error processing DOI {doi}: unexpected status code '
                    f'{response.status_code} from {response.url}')
            self._log.info(f'URL is accessible: {response.url} (Status code: {response.status_code})')
        finally:
            session.close()

        url = response.url
        # Use the same key normalisation as cache lookups, and skip the
        # write entirely when no cache object is configured.
        if self._cache is not None:
            self._cache[self._make_cache_key(doi)] = url
        return url


    def resolve(self, doi, check_doi=True, whitespace=False, skip_cache=False):
        """ Takes a doi (string), returns a url to article page on journal website.

        if check_doi is True (default True), checks DOI before
        submitting query to dx.doi.org.

        if whitespace is True (default False), allows prospective
        dois to contain whitespace when checked.

        if skip_cache is True (default False), doesn't check cache for
        pre-existing results (loads from remote dx.doi.org). The freshly
        loaded result is still written to the cache.

        :param doi: (str)
        :param check_doi: (bool)
        :param whitespace: (bool)
        :param skip_cache: (bool)
        :return: url (str)
        :raises BadDOI: if supplied DOI is None or blank, or failed the
            regular expression check
        :raises DxDOIError: if there was a connection problem or a not-ok
            HTTP status code while loading url
        """
        if doi is None or doi.strip() == '':
            raise BadDOI('DOI cannot be None or empty string')

        if check_doi:
            doi = self.check_doi(doi, whitespace=whitespace)

        url = None if skip_cache else self._query_cache(doi)
        if url is not None:
            return url

        url = self._query_api(doi)

        # Only store real results: caching ``None`` would record a failed
        # lookup as if it were an answer.  The cache is compared against
        # None so that a configured-but-empty cache still gets written.
        if url is not None and self._cache is not None:
            cache_key = self._make_cache_key(doi)
            self._cache[cache_key] = url
            self._log.info('cached results for key {cache_key} ({doi}) '.format(
                    cache_key=cache_key, doi=doi))
        return url


    def _make_cache_key(self, inp):
        return inp.strip()

    def _query_cache(self, key):
        """ Return results for a cache lookup, if found.

        :param key: (str)
        :return: val (str) or None
        """
        if self._cache:
            cache_key = self._make_cache_key(key)
            try:
                val = self._cache[cache_key]
                self._log.debug('cache hit for key {cache_key} ({key}) '.format(
                    cache_key=cache_key, key=key))
                return val
            except KeyError:
                self._log.debug('cache miss for key {cache_key} ({key}) '.format(
                        cache_key=cache_key, key=key))
                return None
        else:
            self._log.debug('cache disabled (self._cache is None)')
            return None