Source code for metapub.convert

"""convert.py: mildly-experimental mashups of various services to get needed IDs.

Defines command-line tools `convert pmid2doi`, `convert doi2pmid`, and `convert bookid2pmid`.
"""

import logging
from urllib.error import HTTPError

from .pubmedfetcher import PubMedFetcher
from .crossref import CrossRefFetcher
from .exceptions import *
from .ncbi_errors import NCBIServiceError

try:
    from docopt import docopt
except ImportError:
    docopt = None

# Module-level logger used by the conversion functions in this file.
log = logging.getLogger('metapub.convert')

# Shared fetcher instances. They start as None and are filled in by
# _start_engines() on first use, so importing this module constructs nothing.
cr_fetch = None   #CrossRefFetcher()
pm_fetch = None   #PubMedFetcher()


# Version string handed to docopt in main() (shown for -v / --version).
__version__ = '0.1'
# NOTE: this assignment replaces the module docstring above. main() passes
# this text to docopt as the usage specification, so the wording of the
# "Usage:" and "Options:" sections is part of the program's behavior.
__doc__ = """Convert.pmid2doi / Convert.doi2pmid / Convert.bookid2pmid

Usage:
    convert -h 
    convert pmid2doi <pmid> [options]
    convert doi2pmid <doi>  [options]
    convert bookid2pmid <book_id>  [options]

Options:
    -h, --help      Show this help page
    -v, --version   Show this command's version.
    -q, --quiet     Shut up all that log garbage.
    -d, --debug     No wait, give me ALL the log garbage! Superceded by --quiet.
    -a, --article   Also print out the article information (from PubMedArticle) if possible.
    -w, --work      Also print out info from the CrossRef entry, if possible.
"""


def _start_engines():
    """Lazily create the shared CrossRef and PubMed fetchers.

    Each of the module-level ``cr_fetch`` / ``pm_fetch`` globals is
    constructed only when it is currently falsy; an already-created fetcher
    is left in place, so calling this repeatedly is cheap.
    """
    global cr_fetch, pm_fetch
    cr_fetch = cr_fetch or CrossRefFetcher()
    pm_fetch = pm_fetch or PubMedFetcher()
    

def interpret_pmids_for_citation_results(pmids):
    """Collapse a list of citation-lookup results into a single answer.

    Args:
        pmids (list or None): results of a citation lookup. Entries may be
            PMIDs (str or int) or the marker strings ``'NOT_FOUND'`` /
            ``'AMBIGUOUS...'``.

    Returns:
        str or None: the PMID as a string when exactly one usable result is
        present; ``'AMBIGUOUS'`` when there are several results or the single
        result is an ambiguity marker; ``None`` when there is nothing usable
        (empty/None input, a ``None`` entry, or ``'NOT_FOUND'``).
    """
    if not pmids:
        return None
    if len(pmids) > 1:
        return 'AMBIGUOUS'

    only = pmids[0]
    if only is None:
        return None

    # Normalise before inspecting: an integer PMID has no .startswith().
    only = str(only)
    if only == 'NOT_FOUND':
        return None
    if only.startswith('AMBIGUOUS'):
        return 'AMBIGUOUS'
    return only
def PubMedArticle2doi(pma):
    '''Ask CrossRef for the DOI matching an already-fetched PubMedArticle.

    Args:
        pma (PubMedArticle)

    Returns:
        doi (str) when the CrossRef fetcher returns a truthy work for the
        article, otherwise None.
    '''
    _start_engines()
    match = cr_fetch.article_by_pma(pma)
    if not match:
        return None
    log.debug('CrossRefWork found (%s) with Crossref score %i.', match.doi, match.score)
    return match.doi
def pmid2doi(pmid):
    '''Resolve a PubMed ID to a DOI.

    The article is fetched from PubMed first; if its record already carries a
    DOI, that value is returned. Otherwise the lookup is delegated to
    CrossRef through PubMedArticle2doi.

    Args:
        pmid (str or int)

    Returns:
        doi (str) or None

    Raises:
        NCBIServiceError: passed through unchanged when raised by the lookup,
            or raised here (chained to the original error) when any other
            exception's text mentions 'connection', 'timeout',
            'server error', 'xml' or 'parse'.
        Exception: any other error from the lookup is re-raised as-is. The
            original docstring names InvalidPMID for an invalid pmid;
            presumably that comes from the PubMed fetcher.
    '''
    service_hints = ('connection', 'timeout', 'server error', 'xml', 'parse')
    try:
        # Errors for a bogus PMID are deliberately left to reach the caller.
        _start_engines()
        article = pm_fetch.article_by_pmid(pmid)
        if not article.doi:
            return PubMedArticle2doi(article)
        log.debug('PMID %s: Found DOI in MedLine XML.', pmid)
        return article.doi
    except NCBIServiceError:
        # Already a service error: hand it back untouched.
        raise
    except Exception as e:
        message = str(e).lower()
        if not any(hint in message for hint in service_hints):
            raise
        # The failure text looks like a service problem; wrap it with advice.
        raise NCBIServiceError(
            f"Unable to convert PMID {pmid} to DOI due to service issues.",
            'conversion_error',
            [
                'Check NCBI service status with: ncbi_health_check --quick',
                'Try again in a few minutes',
                'Verify the PMID is correct',
                'Check your internet connection'
            ]
        ) from e
# Substrings that, when found in an exception's text, are treated as a sign
# of a service-side failure rather than a problem with the input.
_SERVICE_ISSUE_KEYWORDS = ('connection', 'timeout', 'server error', 'xml', 'parse')


def _looks_like_service_issue(error):
    """Return True when the error's text contains a service-failure keyword."""
    message = str(error).lower()
    return any(keyword in message for keyword in _SERVICE_ISSUE_KEYWORDS)


def doi2pmid(doi):
    '''Uses CrossRef and PubMed eutils to look up a PMID given a known DOI.

    Warning: NO validation of input DOI performed here. Use
    metapub.text_mining.find_doi_in_string beforehand if needed.

    Three strategies are tried in order:

    1. fetch the article from PubMed directly by DOI;
    2. run the DOI as a PubMed query and accept a single hit only if that
       article's DOI equals the input;
    3. look the DOI up in CrossRef and feed the resulting citation to the
       PubMed citation matcher.

    In very rare cases the CrossRef->PubMed citation method may yield more
    than one PubMed ID; this function then returns the word 'AMBIGUOUS'.

    :param doi: (str) the DOI to look up; surrounding whitespace is stripped.
    :return pmid: (str) if found; 'AMBIGUOUS' if the citation match is not
        unique; None if no results (including any HTTPError from CrossRef).
    :raises: NCBIServiceError if an NCBI lookup raises it, or if another
        exception's text suggests a service problem. Other exceptions from
        the final citation search are re-raised unchanged.
    '''
    try:
        # for PMA, skip the validation; some pubmed XML has weird partial strings for DOI.
        # We should allow people to search using these oddball strings.
        _start_engines()
        doi = doi.strip()

        try:
            pma = pm_fetch.article_by_doi(doi)
            log.debug('doi2pmid: Found PubMedArticle for DOI %s via eutils fetch', doi)
            return pma.pmid
        except NCBIServiceError:
            raise  # Re-raise service errors
        except Exception:
            # Best-effort first attempt: any ordinary failure just means we
            # move on to the next strategy. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            pass

        # Try doing a DOI lookup right in an advanced query string. Sometimes works and has
        # benefit of being a cached query so it is quick to do again, should we need.
        pmids = pm_fetch.pmids_for_query(doi)
        if len(pmids) == 1:
            # we need to cross-check; pubmed sometimes gives us an article
            # with a SIMILAR doi. *facepalm*
            pma = pm_fetch.article_by_pmid(pmids[0])
            if pma.doi == doi:
                log.debug('doi2pmid: Found PMID via PubMed advanced query for DOI %s', doi)
                return pma.pmid
            log.debug('Pubmed advanced query gave us a problematic result...')
    except NCBIServiceError:
        # Re-raise NCBI service errors
        raise
    except Exception as e:
        if _looks_like_service_issue(e):
            raise NCBIServiceError(
                f"Unable to convert DOI {doi} to PMID due to service issues.",
                'conversion_error',
                [
                    'Check NCBI service status with: ncbi_health_check --quick',
                    'Try again in a few minutes',
                    'Verify the DOI is correct',
                    'Check your internet connection'
                ]
            ) from e
        # Non-service failures are ignored here; fall through to CrossRef.

    # Try Looking up DOI in CrossRef, then feeding results to pubmed citation query tool...
    try:
        work = cr_fetch.article_by_doi(doi)
        log.debug('doi2pmid: Found CrossRef article for DOI %s', doi)
    except HTTPError as error:
        if '404' in str(error):
            log.info('doi2pmid: DOI %s was not found in CrossRef. Giving up.', doi)
            return None
        log.debug('doi2pmid: Unexpected HTTP error occurred during CrossRef lookup:')
        log.debug(error)
        return None

    try:
        pmids = pm_fetch.pmids_for_citation(**work.to_citation())
        if pmids:
            return interpret_pmids_for_citation_results(pmids)
        return None
    except NCBIServiceError:
        # Re-raise NCBI service errors from citation search
        raise
    except Exception as e:
        if _looks_like_service_issue(e):
            raise NCBIServiceError(
                f"Unable to search PubMed citations for DOI {doi} due to service issues.",
                'conversion_error',
                [
                    'Check NCBI service status with: ncbi_health_check --quick',
                    'Try again in a few minutes',
                    'Check your internet connection'
                ]
            ) from e
        raise
def bookid2pmid(book_id):
    """Return the result of PubMedFetcher.pmid_for_bookID for ``book_id``.

    Thin convenience wrapper: it makes sure the shared fetchers exist and
    forwards the call to the module's PubMed fetcher.
    """
    _start_engines()
    pmid = pm_fetch.pmid_for_bookID(book_id)
    return pmid
def main():
    """Command-line entry point for the ``convert`` tool.

    Parses arguments with docopt against this module's ``__doc__`` usage
    text, runs the requested conversion (doi2pmid, pmid2doi or bookid2pmid),
    prints the input and the result, and optionally prints the PubMed article
    (``--article``) and/or the CrossRef work (``--work``) when an identifier
    for them is available.

    Raises:
        ImportError: if docopt is not installed.
    """
    if docopt is None:
        raise ImportError("docopt is required to run main()")
    args = docopt(__doc__, version=__version__)

    # Logger names are dot-separated hierarchies, not glob patterns:
    # 'metapub.*' would configure a logger literally named '*' under
    # 'metapub'. Setting the level on 'metapub' itself is what reaches the
    # package's loggers (e.g. 'metapub.convert') through inheritance.
    package_log = logging.getLogger('metapub')
    if args['--quiet']:
        # NOTE(review): INFO is kept from the original; confirm whether
        # "quiet" should instead raise the threshold to WARNING.
        package_log.setLevel(logging.INFO)
    elif args['--debug']:
        package_log.setLevel(logging.DEBUG)

    # Not every sub-command produces both identifiers, so start with neither.
    pmid = None
    doi = None

    if args['doi2pmid']:
        doi = args['<doi>']
        print('DOI: ', doi)
        pmid = doi2pmid(doi)
        print('PMID: ', pmid)
    elif args['pmid2doi']:
        pmid = args['<pmid>']
        print('PMID: ', pmid)
        doi = pmid2doi(pmid)
        print('DOI: ', doi)
    elif args['bookid2pmid']:
        book_id = args['<book_id>']
        print('BookID: ', book_id)
        pmid = bookid2pmid(book_id)
        print('PMID: ', pmid)
    print()

    # The options are documented as "if possible": skip the extra lookups
    # when the conversion did not yield a usable identifier.
    _start_engines()
    if args['--article']:
        if pmid and pmid != 'AMBIGUOUS':
            print(pm_fetch.article_by_pmid(pmid))
        else:
            log.info('No usable PMID available; skipping --article output.')
    if args['--work']:
        if doi:
            print(cr_fetch.article_by_doi(doi))
        else:
            log.info('No DOI available; skipping --work output.')