Source code for metapub.findit.findit

__author__ = 'nthmost'

import time
import logging

import requests

from urllib.parse import urlparse

from ..crossref import CrossRefFetcher
from ..exceptions import MetaPubError
from ..utils import asciify
from ..config import DEFAULT_CACHE_DIR
from ..pubmedfetcher import PubMedFetcher
from ..convert import doi2pmid
from ..cache_utils import get_cache_path, SQLiteCache, datetime_to_timestamp

from .logic import find_article_from_pma
from .dances import the_sciencedirect_disco, the_doi_2step, the_wolterskluwer_volta

log = logging.getLogger('metapub.findit')

""" findit/findit.py

    Provides the FindIt object, a tidy object layer over the
        logic.find_article_from_pma function. (see logic.py)

    The FindIt class allows lookups of the PDF starting from only a
    DOI or a PMID, using the following instantiation approaches:

    source = FindIt('1234567')   # assumes argument is a pubmed ID

    source = FindIt(pmid=1234567)  # pmid can be an int or a string

    source = FindIt(doi="10.xxxx/xxx.xxx")   # doi instead of pmid.

    See the FindIt docstring for more information.

    *** IMPORTANT NOTE ***

    In many cases, this code performs intermediary HTTP requests in order to
    scrape a PDF url out of a page, and sometimes tests the url to make sure
    that what's being sent back is in fact a PDF.

    If you would like these requests to go through a proxy (e.g. if you would
    like to prevent making multiple requests of the same servers, which may have
    effects like getting your IP shut off from PubMedCentral), set the
    HTTP_PROXY environment variable in your code or on the command line before
    using any FindIt functionality.
"""

# File name handed to get_cache_path() (together with the cache directory)
# when the FindIt result cache is created in _get_findit_cache().
CACHE_FILENAME = 'findit.db'

# Process-wide cache object; stays None until _get_findit_cache() creates it.
FINDIT_CACHE = None


# Process-wide PubMedFetcher; stays None until _start_engines() creates it.
pm_fetch = None

def _start_engines():
    """Create the shared module-level PubMedFetcher on first use.

    Calls made while ``pm_fetch`` already holds a truthy fetcher do nothing.
    """
    global pm_fetch
    if pm_fetch:
        return
    log.debug('Started FindIt engine.')
    pm_fetch = PubMedFetcher()

def _get_findit_cache(cachedir):
    """Return the process-wide FindIt cache, creating it on first use.

    Args:
        cachedir: Directory passed to get_cache_path() together with
            CACHE_FILENAME to locate the cache file.

    Returns:
        The shared SQLiteCache instance stored in FINDIT_CACHE.

    Note:
        The cache is created once per process. While FINDIT_CACHE holds a
        cache object, ``cachedir`` is ignored; to point FindIt at another
        directory (mostly useful in tests) FINDIT_CACHE has to be reset to
        None first.
    """
    global FINDIT_CACHE
    # Compare against None explicitly: the cache is a container-like object,
    # and a truthiness test could mistake an existing-but-empty cache for
    # "not created yet" and rebuild it on every call.
    if FINDIT_CACHE is None:
        cache_path = get_cache_path(cachedir, CACHE_FILENAME)
        FINDIT_CACHE = SQLiteCache(cache_path)
        log.info('FindIt Cache initialized at %s', cache_path)
    return FINDIT_CACHE



[docs]
class FindIt(object):
    """ FindIt

        FindIt helps locate an article's fulltext PDF based on its pubmed ID
        or doi, using the following instantiation approaches:

        source = FindIt('1234567')   # assumes argument is a pubmed ID

        source = FindIt(pmid=1234567)  # pmid can be an int or a string

        source = FindIt(doi="10.xxxx/xxx.xxx")   # doi instead of pmid.

        The machinery in the FindIt object performs all necessary data lookups
        (e.g. looking up a missing DOI, or using a DOI to get a PubMedArticle)
        to end up with a url and reason, which attaches to the FindIt object
        in the following attributes:

        source = FindIt(pmid=PMID)
        source.url
        source.reason
        source.pmid
        source.doi
        source.doi_score

        The "doi_score" is an indication of where the DOI for this PMID ended up
        coming from. If it was supplied by the user or by PubMed, doi_score will be 100.

        If CrossRef came into play during the process to find a DOI that was missing
        for the PubMedArticle object, the doi_score will come from CrossRef (0 to 100).

        Network Timeout Configuration (v0.11+):
        =======================================
        
        FindIt now includes timeout controls to prevent infinite stalling:
        - request_timeout: HTTP request timeout in seconds (default: 10)
        - max_redirects: Maximum redirects to follow (default: 3)
        
        These parameters are applied consistently across all publisher-specific
        strategies to ensure reliable operation.
    """


[docs]
    def __init__(self, pmid=None, cachedir=DEFAULT_CACHE_DIR, **kwargs):
        """Set up a FindIt lookup and run it immediately.

        Construction loads the PubMed article for the given identifier and then
        resolves a PDF url, so the finished object already carries its result
        in the `url` and `reason` attributes.

        Args:
            pmid (str or int, optional): PubMed ID of the article.
            cachedir (str, optional): Directory holding the FindIt result cache.
                Pass None to skip the cache entirely.
            **kwargs: Optional settings, each read with `kwargs.get`:
                doi (str): DOI to start from when no pmid is given.
                url (str): Initial value of `self.url`; replaced by the lookup
                    result unless the lookup raises a ConnectionError.
                use_nih (bool): Forwarded to the lookup logic. Defaults to False.
                use_crossref (bool): Create a CrossRefFetcher so that a DOI
                    missing from PubMed can be looked up. Defaults to False.
                doi_min_score (int): Stored as `self.doi_min_score`.
                    Defaults to 60.
                verify (bool): Forwarded to the lookup as its `verify` flag.
                    Defaults to True.
                retry_errors (bool): Passed to `load_from_cache`, where it
                    widens the set of cached error reasons that trigger a
                    fresh lookup. Defaults to False.
                debug (bool): Set the 'metapub.findit' logger to DEBUG instead
                    of INFO. Defaults to False.
                tmpdir (str): Stored as `self.tmpdir`. Defaults to '/tmp'.
                request_timeout (int): Stored and forwarded by `load`.
                    Defaults to 10.
                max_redirects (int): Stored and forwarded by `load`.
                    Defaults to 3.

        Raises:
            MetaPubError: If neither a pmid nor a doi is available, or if the
                supplied doi cannot be converted to a pmid.

        Note:
            A requests ConnectionError raised during the lookup is not
            propagated; it is recorded in `reason` as 'TXERROR: <error repr>'.
        """
        _start_engines()

        # Identifiers and the (not yet known) result.
        self.pmid = pmid or kwargs.get('pmid')
        self.doi = kwargs.get('doi')
        self.url = kwargs.get('url')
        self.reason = None

        # Lookup switches.
        self.use_nih = kwargs.get('use_nih', False)
        self.use_crossref = kwargs.get('use_crossref', False)

        # The CrossRef client only exists when the fallback was asked for.
        if self.use_crossref:
            self.crfetch = CrossRefFetcher()
            log.debug('CrossRefFetcher initialized for FindIt.')

        #TODO: revisit this whole score thing (check our CrossRef work, it's been a minute.)
        self.doi_min_score = kwargs.get('doi_min_score', 60)   #60, maybe?
        self.tmpdir = kwargs.get('tmpdir', '/tmp')
        self.doi_score = None
        self.pma = None

        self.verify = kwargs.get('verify', True)
        retry_errors = kwargs.get('retry_errors', False)

        # Network timeout and redirect settings, forwarded by load().
        self.request_timeout = kwargs.get('request_timeout', 10)
        self.max_redirects = kwargs.get('max_redirects', 3)

        # cachedir is kept as given (load() forwards it); the cache object
        # itself is only obtained when a directory was supplied.
        self._cachedir = cachedir
        self._cache = None if cachedir is None else _get_findit_cache(cachedir)

        # The level is applied to the 'metapub.findit' logger on every
        # instantiation.
        self._log = logging.getLogger('metapub.findit')
        wants_debug = kwargs.get('debug', False)
        self._log.setLevel(logging.DEBUG if wants_debug else logging.INFO)

        # A pmid takes precedence over a doi when both are present.
        if not (self.pmid or self.doi):
            raise MetaPubError(
                'Supply either a pmid or a doi to instantiate. e.g. FindIt(pmid=1234567)')
        if self.pmid:
            self._load_pma_from_pmid()
        else:
            self._load_pma_from_doi()

        try:
            if not self._cache:
                outcome = self.load(verify=self.verify)
            else:
                outcome = self.load_from_cache(verify=self.verify,
                                               retry_errors=retry_errors)
            self.url, self.reason = outcome
        except requests.exceptions.ConnectionError as error:
            self.reason = 'TXERROR: %r' % error



[docs]
    def load(self, verify=True):
        """Run a fresh (uncached) PDF lookup for the loaded article.

        Delegates to `find_article_from_pma`, handing over the loaded article
        together with this object's lookup settings.

        Args:
            verify (bool, optional): Forwarded to `find_article_from_pma` as
                its `verify` flag. Defaults to True.

        Returns:
            Whatever `find_article_from_pma` returns; callers in this class
            unpack it as a (url, reason) pair.

        Note:
            This method catches nothing itself. `__init__` is where a requests
            ConnectionError is turned into a "TXERROR: <details>" reason.
        """
        article = self.pma
        lookup_settings = {
            'use_nih': self.use_nih,
            'verify': verify,
            'cachedir': self._cachedir,
            'request_timeout': self.request_timeout,
            'max_redirects': self.max_redirects,
        }
        return find_article_from_pma(article, **lookup_settings)



[docs]
    def load_from_cache(self, verify=True, retry_errors=False):
        """Load article URL from cache, with fallback to fresh lookup.

        Checks cache for previously computed results using article identifiers.
        If not cached or retry_errors is True for error reasons, performs fresh
        lookup and caches the result.

        Args:
            verify (bool, optional): Verify URLs by testing HTTP response.
                Defaults to True.
            retry_errors (bool, optional): Force fresh lookup if cached result
                has error reasons like "TODO", "PAYWALL", "CANTDO", or "TXERROR".
                Note: "NOFORMAT" results are always retried since new publisher
                support is frequently added. Defaults to False.

        Returns:
            Tuple[Optional[str], Optional[str]]: A tuple of (url, reason).
                - url: Direct link to PDF if found, None otherwise.
                - reason: Explanation if PDF not found, None if successful.

        Note:
            Connection errors are not cached to avoid persisting temporary network issues.
        """
        # Always retry NOFORMAT results since new journal support gets added frequently
        retry_reasons = ['NOFORMAT']
        # Optionally retry other error types when requested
        if retry_errors:
            retry_reasons.extend(['PAYWALL', 'TODO', 'CANTDO', 'TXERROR'])

        cache_result = self._query_cache(self.pmid)

        if cache_result:
            url = cache_result['url']
            reason = cache_result.get('reason', '') or ''  # Handle None
            verified = cache_result.get('verify', False)

            # Extract the error code (part before ':' if present)
            reason_code = reason.split(':')[0] if reason else ''

            # Decision logic in ranked order
            # 1. Always retry certain errors.
            # 2. Cache result is unverified && we're still not verifying.
            # 3. Cache result is verified && no error retries called for.

            must_retry = reason_code in retry_reasons

            if not must_retry and (verified or not verify):
                return (url, reason)


        # === RETRY === #
        # we're here for one of the following reasons:
        # 1) no cache result for this query
        # 2) previous result was unverified and now verify=True
        # 3) previous result had a "reason" in retry_reasons
        url, reason = self.load(verify=verify)
        self._store_cache(self.pmid, url=url, reason=reason, verify=verify)
        return (url, reason)


    def _load_pma_from_pmid(self):
        """ Loads self.pma for self.pmid and settles on a DOI for the article.

        Mutates:
            self.pma (article fetched from PubMed for self.pmid)
            self.doi (copied from self.pma, or taken from CrossRef when PubMed
                has none and CrossRef lookups are enabled)
            self.doi_score (100 if doi found in self.pma, else crossref score)
            self.reason (a 'MISSING: doi ...' message when no DOI was obtained)

        NOTE(review): self.doi_min_score is not consulted here, so a CrossRef
        match is accepted whatever its score -- confirm whether the threshold
        should be enforced.
        """
        self.pma = pm_fetch.article_by_pmid(self.pmid)

        if self.pma.doi:
            self.doi = self.pma.doi
            self.doi_score = 100
            return

        # A DOI that is falsy but not None (e.g. '') is neither copied nor
        # looked up; only a DOI that is actually absent triggers the fallback.
        if self.pma.doi is not None:
            return

        if not self.use_crossref:
            self.reason = 'MISSING: doi (CrossRef lookups disabled)'
            return

        # if desired, try to learn the DOI using CrossRef
        self._log.debug('Using CrossRef to find DOI for PMID %s', self.pmid)
        work = self.crfetch.article_by_pma(self.pma)
        if work:
            self.doi = work.doi
            self.doi_score = work.score
            # logging formats lazily with `msg % args`, so every argument
            # needs a matching placeholder in the message.
            self._log.debug('\tFound DOI %s with score %s', self.doi, self.doi_score)
        else:
            self._log.debug('\tCrossRef DOI lookup failed for PMID %s.', self.pmid)
            self.reason = 'MISSING: doi (CrossRef lookup failed)'

    def _load_pma_from_doi(self):
        """ Resolves self.doi to a pmid and loads the matching PubMed article.

        Mutates:
            self.pmid (result of metapub.convert.doi2pmid for self.doi)
            self.pma  (article fetched for that pmid)
            self.doi_score (set to 100 once the pmid has been resolved)

        Raises:
            MetaPubError: when doi2pmid returns a falsy value for self.doi.
        """
        self.pmid = doi2pmid(self.doi)
        if not self.pmid:
            raise MetaPubError('Could not get a pmid for doi %s' % self.doi)

        self.pma = pm_fetch.article_by_pmid(self.pmid)
        self.doi_score = 100


[docs]
    def to_dict(self):
        """ Returns a dictionary containing the public attributes of this object"""
        return {'pmid': self.pmid,
                'doi': self.doi,
                'reason': self.reason,
                'url': self.url,
                'doi_score': self.doi_score,
                }


    def _make_cache_key(self, pmid):
        """ Returns normalized key (pmid as integer) for hash lookup / store. """
        return int(pmid)

    def _store_cache(self, cache_key, **kwargs):
        """ Store supplied cache_key pointing to values supplied in kwargs.

        A time.time() timestamp will be added to the value dictionary when stored.

        There is no return from this function. Exceptions from the SQLiteCache
        object may be raised.
        """
        cache_value = kwargs.copy()
        cache_value['timestamp'] = time.time()
        self._cache[self._make_cache_key(cache_key)] = cache_value

    def _query_cache(self, pmid, expiry_date=None):
        """ Return results of a lookup from the cache, if available.
        Return None if not available.

        Cache results are stored with a time.time() timestamp.

        When expiry_date is supplied, results from the cache past their
        sell-by date will be expunged from the cache and return will be None.

        expiry_date can be either a python datetime or a timestamp.

        :param: cache_key: (required)
        :param: expiry_date (optional, default None)
        :rtype: (url, reason) or None
        """

        if hasattr(expiry_date, 'strftime'):
            # convert to timestamp
            sellby = datetime_to_timestamp(expiry_date)
        else:
            # make sure sellby is a number, not None
            sellby = expiry_date if expiry_date else 0

        if self._cache:
            cache_key = self._make_cache_key(pmid)
            try:
                res = self._cache[cache_key]
                timestamp = res['timestamp']
                if timestamp < sellby:
                    self._log.debug('Cache: expunging result for %s (%i)', cache_key, timestamp)
                else:
                    self._log.debug('Cache: returning result for %s (%i)', cache_key, timestamp)
                return res

            except KeyError:
                self._log.debug('Cache: no result for key %s', cache_key)
                return None
        else:
            self._log.debug('Cache disabled (self._cache is None)')
            return None