__author__ = 'nthmost'
import time
import logging
import requests
from urllib.parse import urlparse
from ..crossref import CrossRefFetcher
from ..exceptions import MetaPubError
from ..utils import asciify
from ..config import DEFAULT_CACHE_DIR
from ..pubmedfetcher import PubMedFetcher
from ..convert import doi2pmid
from ..cache_utils import get_cache_path, SQLiteCache, datetime_to_timestamp
from .logic import find_article_from_pma
from .dances import the_sciencedirect_disco, the_doi_2step, the_wolterskluwer_volta
log = logging.getLogger('metapub.findit')
""" findit/findit.py
Provides FindIt object, providing a tidy object layer
into the logic.get_pdf_from_pma function. (see logic.py)
The FindIt class allows lookups of the PDF starting from only a
DOI or a PMID, using the following instantiation approaches:
source = FindIt('1234567') # assumes argument is a pubmed ID
source = FindIt(pmid=1234567) # pmid can be an int or a string
source = FindIt(doi="10.xxxx/xxx.xxx") # doi instead of pmid.
See the FindIt docstring for more information.
*** IMPORTANT NOTE ***
In many cases, this code performs intermediary HTTP requests in order to
scrape a PDF url out of a page, and sometimes tests the url to make sure
that what's being sent back is in fact a PDF.
If you would like these requests to go through a proxy (e.g. if you would
like to prevent making multiple requests of the same servers, which may have
effects like getting your IP shut off from PubMedCentral), set the
HTTP_PROXY environment variable in your code or on the command line before
using any FindIt functionality.
"""
CACHE_FILENAME = 'findit.db'
FINDIT_CACHE = None
pm_fetch = None
def _start_engines():
global pm_fetch
if not pm_fetch:
log.debug('Started FindIt engine.')
pm_fetch = PubMedFetcher()
def _get_findit_cache(cachedir):
global FINDIT_CACHE
# allow swap of cache directory without restarting process.
# this is mostly for testing but also a few limited use cases.
if not FINDIT_CACHE:
_cache_path = get_cache_path(cachedir, CACHE_FILENAME)
FINDIT_CACHE = SQLiteCache(_cache_path)
log.info('FindIt Cache initialized at %s', _cache_path)
return FINDIT_CACHE
[docs]
class FindIt(object):
""" FindIt
FindIt helps locate an article's fulltext PDF based on its pubmed ID
or doi, using the following instantiation approaches:
source = FindIt('1234567') # assumes argument is a pubmed ID
source = FindIt(pmid=1234567) # pmid can be an int or a string
source = FindIt(doi="10.xxxx/xxx.xxx") # doi instead of pmid.
The machinery in the FindIt object performs all necessary data lookups
(e.g. looking up a missing DOI, or using a DOI to get a PubMedArticle)
to end up with a url and reason, which attaches to the FindIt object
in the following attributes:
source = FindIt(pmid=PMID)
source.url
source.reason
source.pmid
source.doi
source.doi_score
The "doi_score" is an indication of where the DOI for this PMID ended up
coming from. If it was supplied by the user or by PubMed, doi_score will be 100.
If CrossRef came into play during the process to find a DOI that was missing
for the PubMedArticle object, the doi_score will come from CrossRef (0 to 100).
Network Timeout Configuration (v0.11+):
=======================================
FindIt now includes timeout controls to prevent infinite stalling:
- request_timeout: HTTP request timeout in seconds (default: 10)
- max_redirects: Maximum redirects to follow (default: 3)
These parameters are applied consistently across all publisher-specific
strategies to ensure reliable operation.
"""
[docs]
def __init__(self, pmid=None, cachedir=DEFAULT_CACHE_DIR, **kwargs):
"""Initialize FindIt to locate full-text PDFs for academic papers.
Args:
pmid (str or int, optional): PubMed ID of the article to find.
cachedir (str, optional): Directory for caching results. Defaults to
system cache directory. Set to None to disable caching.
**kwargs: Additional keyword arguments:
doi (str): DOI of the article (alternative to pmid).
url (str): Pre-existing URL (for testing/validation).
use_nih (bool): Use NIH access when available. Defaults to False.
use_crossref (bool): Enable CrossRef fallback for missing DOIs.
Defaults to False.
doi_min_score (int): Minimum CrossRef confidence score for DOI
matches. Defaults to 60.
verify (bool): Verify URLs by testing HTTP response. Defaults to True.
retry_errors (bool): Retry if cached result has error reasons like
"PAYWALL", "TODO", "CANTDO", or "TXERROR". Note: "NOFORMAT"
results are always retried. Defaults to False.
debug (bool): Enable debug logging. Defaults to False.
tmpdir (str): Temporary directory for downloads. Defaults to '/tmp'.
request_timeout (int): Timeout in seconds for HTTP requests. Defaults to 10.
max_redirects (int): Maximum number of redirects to follow. Defaults to 3.
Raises:
MetaPubError: If neither pmid nor doi is provided.
Note:
After initialization, access results via the `url` and `reason` attributes.
If url is None, check `reason` for explanation of why PDF wasn't found.
"""
_start_engines()
self.pmid = pmid if pmid else kwargs.get('pmid', None)
self.doi = kwargs.get('doi', None)
self.url = kwargs.get('url', None)
self.reason = None
self.use_nih = kwargs.get('use_nih', False)
self.use_crossref = kwargs.get('use_crossref', False)
if self.use_crossref:
self.crfetch = CrossRefFetcher()
log.debug('CrossRefFetcher initialized for FindIt.')
#TODO: revisit this whole score thing (check our CrossRef work, it's been a minute.)
self.doi_min_score = kwargs.get('doi_min_score', 60) #60, maybe?
self.tmpdir = kwargs.get('tmpdir', '/tmp')
self.doi_score = None
self.pma = None
self.verify = kwargs.get('verify', True)
retry_errors = kwargs.get('retry_errors', False)
# Network timeout and redirect settings
self.request_timeout = kwargs.get('request_timeout', 10)
self.max_redirects = kwargs.get('max_redirects', 3)
# Store cachedir for registry system
self._cachedir = cachedir
if cachedir is None:
self._cache = None
else:
self._cache = _get_findit_cache(cachedir)
self._log = logging.getLogger('metapub.findit')
if kwargs.get('debug', False):
self._log.setLevel(logging.DEBUG)
else:
self._log.setLevel(logging.INFO)
if self.pmid:
self._load_pma_from_pmid()
elif self.doi:
self._load_pma_from_doi()
else:
raise MetaPubError(
'Supply either a pmid or a doi to instantiate. e.g. FindIt(pmid=1234567)')
try:
if self._cache:
self.url, self.reason = self.load_from_cache(verify=self.verify, retry_errors=retry_errors)
else:
self.url, self.reason = self.load(verify=self.verify)
except requests.exceptions.ConnectionError as error:
self.reason = 'TXERROR: %r' % error
[docs]
def load(self, verify=True):
"""Find full-text PDF URL for the loaded article.
This method performs the core FindIt logic using publisher-specific
strategies to locate downloadable PDFs.
Args:
verify (bool, optional): Test URLs by making HTTP requests to ensure
files are downloadable. Setting to False speeds up processing
significantly. Defaults to True.
Returns:
Tuple[Optional[str], Optional[str]]: A tuple of (url, reason).
- url: Direct link to PDF if found, None otherwise.
- reason: Explanation if PDF not found (e.g., "PAYWALL", "NOFORMAT").
May be None if URL was successfully found.
Note:
If a ConnectionError occurs during lookup, returns (None, "TXERROR: <details>").
"""
return find_article_from_pma(self.pma, use_nih=self.use_nih, verify=verify,
cachedir=self._cachedir, request_timeout=self.request_timeout,
max_redirects=self.max_redirects)
[docs]
def load_from_cache(self, verify=True, retry_errors=False):
"""Load article URL from cache, with fallback to fresh lookup.
Checks cache for previously computed results using article identifiers.
If not cached or retry_errors is True for error reasons, performs fresh
lookup and caches the result.
Args:
verify (bool, optional): Verify URLs by testing HTTP response.
Defaults to True.
retry_errors (bool, optional): Force fresh lookup if cached result
has error reasons like "TODO", "PAYWALL", "CANTDO", or "TXERROR".
Note: "NOFORMAT" results are always retried since new publisher
support is frequently added. Defaults to False.
Returns:
Tuple[Optional[str], Optional[str]]: A tuple of (url, reason).
- url: Direct link to PDF if found, None otherwise.
- reason: Explanation if PDF not found, None if successful.
Note:
Connection errors are not cached to avoid persisting temporary network issues.
"""
# Always retry NOFORMAT results since new journal support gets added frequently
retry_reasons = ['NOFORMAT']
# Optionally retry other error types when requested
if retry_errors:
retry_reasons.extend(['PAYWALL', 'TODO', 'CANTDO', 'TXERROR'])
cache_result = self._query_cache(self.pmid)
if cache_result:
url = cache_result['url']
reason = cache_result.get('reason', '') or '' # Handle None
verified = cache_result.get('verify', False)
# Extract the error code (part before ':' if present)
reason_code = reason.split(':')[0] if reason else ''
# Decision logic in ranked order
# 1. Always retry certain errors.
# 2. Cache result is unverified && we're still not verifying.
# 3. Cache result is verified && no error retries called for.
must_retry = reason_code in retry_reasons
if not must_retry and (verified or not verify):
return (url, reason)
# === RETRY === #
# we're here for one of the following reasons:
# 1) no cache result for this query
# 2) previous result was unverified and now verify=True
# 3) previous result had a "reason" in retry_reasons
url, reason = self.load(verify=verify)
self._store_cache(self.pmid, url=url, reason=reason, verify=verify)
return (url, reason)
def _load_pma_from_pmid(self):
""" Loads self.pma if self.pmid is present.
Mutates:
self.doi (using crossref to look this information up if necessary)
self.doi_score (100 if doi found in self.pma, else crossref score)
"""
self.pma = pm_fetch.article_by_pmid(self.pmid)
if self.pma.doi:
self.doi = self.pma.doi
self.doi_score = 100
return
# if desired, try to learn the DOI using CrossRef
if self.pma.doi == None:
if self.use_crossref:
self._log.debug('Using CrossRef to find DOI for PMID %s', self.pmid)
work = self.crfetch.article_by_pma(self.pma)
if work:
self.doi = work.doi
self.doi_score = work.score
self._log.debug('\tFound DOI ', self.doi, ' with score ', self.doi_score)
else:
self._log.debug('\tCrossRef DOI lookup failed for PMID %s.', self.pmid)
self.reason = 'MISSING: doi (CrossRef lookup failed)'
else:
self.reason = 'MISSING: doi (CrossRef lookups disabled)'
def _load_pma_from_doi(self):
""" Loads self.pma if self.doi is present.
Mutates:
self.pmid (using metapub.convert.doi2pmid)
self.pma (if pmid was found)
self.doi_score (10.0 if doi found in self.pma, else crossref score)
"""
self.pmid = doi2pmid(self.doi)
if self.pmid:
self.pma = pm_fetch.article_by_pmid(self.pmid)
self.doi_score = 100
else:
raise MetaPubError('Could not get a pmid for doi %s' % self.doi)
[docs]
def to_dict(self):
""" Returns a dictionary containing the public attributes of this object"""
return {'pmid': self.pmid,
'doi': self.doi,
'reason': self.reason,
'url': self.url,
'doi_score': self.doi_score,
}
def _make_cache_key(self, pmid):
""" Returns normalized key (pmid as integer) for hash lookup / store. """
return int(pmid)
def _store_cache(self, cache_key, **kwargs):
""" Store supplied cache_key pointing to values supplied in kwargs.
A time.time() timestamp will be added to the value dictionary when stored.
There is no return from this function. Exceptions from the SQLiteCache
object may be raised.
"""
cache_value = kwargs.copy()
cache_value['timestamp'] = time.time()
self._cache[self._make_cache_key(cache_key)] = cache_value
def _query_cache(self, pmid, expiry_date=None):
""" Return results of a lookup from the cache, if available.
Return None if not available.
Cache results are stored with a time.time() timestamp.
When expiry_date is supplied, results from the cache past their
sell-by date will be expunged from the cache and return will be None.
expiry_date can be either a python datetime or a timestamp.
:param: cache_key: (required)
:param: expiry_date (optional, default None)
:rtype: (url, reason) or None
"""
if hasattr(expiry_date, 'strftime'):
# convert to timestamp
sellby = datetime_to_timestamp(expiry_date)
else:
# make sure sellby is a number, not None
sellby = expiry_date if expiry_date else 0
if self._cache:
cache_key = self._make_cache_key(pmid)
try:
res = self._cache[cache_key]
timestamp = res['timestamp']
if timestamp < sellby:
self._log.debug('Cache: expunging result for %s (%i)', cache_key, timestamp)
else:
self._log.debug('Cache: returning result for %s (%i)', cache_key, timestamp)
return res
except KeyError:
self._log.debug('Cache: no result for key %s', cache_key)
return None
else:
self._log.debug('Cache disabled (self._cache is None)')
return None