Source code for metapub.convert

"""convert.py: mildly-experimental mashups of various services to get needed IDs.

Defines command-line tools `convert pmid2doi`, `convert doi2pmid`, and `convert bookid2pmid`.
"""

import logging
from urllib.error import HTTPError

from .pubmedfetcher import PubMedFetcher
from .crossref import CrossRefFetcher
from .exceptions import *
from .ncbi_errors import NCBIServiceError

try:
    from docopt import docopt
except ImportError:
    docopt = None

# Module-level logger used by every function in this file.
log = logging.getLogger('metapub.convert')

# Shared fetcher instances. They stay None at import time and are only
# constructed on first use by _start_engines().
cr_fetch = None   #CrossRefFetcher()
pm_fetch = None   #PubMedFetcher()


# Version string that main() passes to docopt as `version`.
__version__ = '0.1'
# Rebinding __doc__ replaces the module docstring with the docopt usage text
# below; main() passes this exact text to docopt() to define the CLI.
__doc__ = """Convert.pmid2doi / Convert.doi2pmid / Convert.bookid2pmid

Usage:
    convert -h 
    convert pmid2doi <pmid> [options]
    convert doi2pmid <doi>  [options]
    convert bookid2pmid <book_id>  [options]

Options:
    -h, --help      Show this help page
    -v, --version   Show this command's version.
    -q, --quiet     Shut up all that log garbage.
    -d, --debug     No wait, give me ALL the log garbage! Superceded by --quiet.
    -a, --article   Also print out the article information (from PubMedArticle) if possible.
    -w, --work      Also print out info from the CrossRef entry, if possible.
"""


def _start_engines():
    '''Create the shared CrossRefFetcher and PubMedFetcher on first use.

    Each module-level fetcher is only constructed while its slot is still
    falsy, so repeated calls keep reusing the instances already created.
    '''
    global cr_fetch, pm_fetch
    cr_fetch = cr_fetch or CrossRefFetcher()
    pm_fetch = pm_fetch or PubMedFetcher()
    


[docs]
def interpret_pmids_for_citation_results(pmids):
    '''Reduce a list of citation-lookup results to a single answer.

    Args:
        pmids (list or None): results of a citation lookup. Entries may be
            PMIDs (str or int) or the marker strings 'NOT_FOUND' and
            'AMBIGUOUS...'. None is treated the same as an empty list.

    Returns:
        str: the PMID, when exactly one unambiguous entry is present.
        'AMBIGUOUS': when more than one entry is present, or the single
            entry starts with 'AMBIGUOUS'.
        None: when the list is empty/None or the single entry is 'NOT_FOUND'.
    '''
    if not pmids:
        return None
    if len(pmids) > 1:
        return 'AMBIGUOUS'

    # Coerce before inspecting: the result is always returned as a str, so a
    # non-string PMID (e.g. an int) must not break the marker checks below.
    only = str(pmids[0])
    if only == 'NOT_FOUND':
        return None
    if only.startswith('AMBIGUOUS'):
        return 'AMBIGUOUS'
    return only




[docs]
def PubMedArticle2doi(pma):
    '''Ask CrossRef for the DOI that matches a PubMedArticle.

    Args:
        pma (PubMedArticle): the article to look up.

    Returns:
        doi (str) of the CrossRef work found for the article, or None when
        the CrossRef lookup returns nothing.
    '''
    _start_engines()

    match = cr_fetch.article_by_pma(pma)
    if not match:
        return None

    log.debug('CrossRefWork found (%s) with Crossref score %i.', match.doi, match.score)
    return match.doi




[docs]
def pmid2doi(pmid):
    '''Look up the DOI for a PubMed ID.

    The article is fetched from PubMed first; if its record already carries a
    DOI, that value is returned. Otherwise the article is handed to
    PubMedArticle2doi for a CrossRef lookup.

    Args:
        pmid (str or int)

    Returns:
        doi (str) or None

    Raises:
        InvalidPMID (if pmid is invalid)
        NCBIServiceError (if NCBI services are down, or if another error's
            message looks like a connection/timeout/server/XML-parsing problem)
    '''
    # Substrings that mark an otherwise generic exception as a likely service failure.
    service_hints = ('connection', 'timeout', 'server error', 'xml', 'parse')

    try:
        # Errors about an invalid pmid are allowed to travel back to the caller.
        _start_engines()
        article = pm_fetch.article_by_pmid(pmid)
        if not article.doi:
            return PubMedArticle2doi(article)
        log.debug('PMID %s: Found DOI in MedLine XML.', pmid)
        return article.doi
    except NCBIServiceError:
        # Already a service error: pass it through untouched.
        raise
    except Exception as e:
        message = str(e).lower()
        if not any(hint in message for hint in service_hints):
            # Not recognisably a service problem: re-raise the original error.
            raise
        raise NCBIServiceError(
            f"Unable to convert PMID {pmid} to DOI due to service issues.",
            'conversion_error',
            [
                'Check NCBI service status with: ncbi_health_check --quick',
                'Try again in a few minutes',
                'Verify the PMID is correct',
                'Check your internet connection'
            ]
        ) from e




[docs]
def _looks_like_service_issue(error):
    '''Return True if the error's text hints at a network, server or XML-parsing failure.'''
    message = str(error).lower()
    return any(keyword in message for keyword in (
        'connection', 'timeout', 'server error', 'xml', 'parse'
    ))


def doi2pmid(doi):
    '''uses CrossRef and PubMed eutils to lookup a PMID given a known doi.

    Warning: NO validation of input DOI performed here. Use
             metapub.text_mining.find_doi_in_string beforehand if needed.

    Three strategies are tried in order:
      1. fetch the PubMed article directly by DOI;
      2. run the DOI as a PubMed query and accept a single hit only if that
         article's DOI matches exactly;
      3. look the DOI up in CrossRef and feed the resulting citation to the
         PubMed citation search.

    If a PMID can be found, return it. Otherwise return None.

    In very rare cases, use of the CrossRef->pubmed citation method used
    here may result in more than one pubmed ID. In this case, this function
    will return instead the word 'AMBIGUOUS'.

    :param doi: (str)
    :return pmid: the PMID if found; 'AMBIGUOUS' if citation count > 1; None if no results.
    :raises: NCBIServiceError if NCBI services are down
    '''
    try:
        # for PMA, skip the validation; some pubmed XML has weird partial strings for DOI.
        # We should allow people to search using these oddball strings.
        _start_engines()
        doi = doi.strip()
        try:
            pma = pm_fetch.article_by_doi(doi)
            log.debug('doi2pmid: Found PubMedArticle for DOI %s via eutils fetch', doi)
            return pma.pmid
        except NCBIServiceError:
            raise  # Re-raise service errors
        except Exception as error:
            # Best-effort step: any ordinary failure just means "try the next
            # strategy". Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            log.debug('doi2pmid: eutils fetch by DOI %s failed (%s); trying other lookups.',
                      doi, error)

        # Try doing a DOI lookup right in an advanced query string. Sometimes works and has
        # benefit of being a cached query so it is quick to do again, should we need.
        pmids = pm_fetch.pmids_for_query(doi)
        if len(pmids) == 1:
            # we need to cross-check; pubmed sometimes gives us an article
            # with a SIMILAR doi.
            pma = pm_fetch.article_by_pmid(pmids[0])
            if pma.doi == doi:
                log.debug('doi2pmid: Found PMID via PubMed advanced query for DOI %s', doi)
                return pma.pmid

            log.debug('Pubmed advanced query gave us a problematic result...')
    except NCBIServiceError:
        # Re-raise NCBI service errors
        raise
    except Exception as e:
        if _looks_like_service_issue(e):
            raise NCBIServiceError(
                f"Unable to convert DOI {doi} to PMID due to service issues.",
                'conversion_error',
                [
                    'Check NCBI service status with: ncbi_health_check --quick',
                    'Try again in a few minutes',
                    'Verify the DOI is correct',
                    'Check your internet connection'
                ]
            ) from e
        # Non-service failures are deliberately not fatal here: fall through
        # to the CrossRef-based lookup below.
        log.debug('doi2pmid: PubMed lookups for DOI %s failed (%s); falling back to CrossRef.',
                  doi, e)

    # Try Looking up DOI in CrossRef, then feeding results to pubmed citation query tool...
    try:
        work = cr_fetch.article_by_doi(doi)
        log.debug('doi2pmid: Found CrossRef article for DOI %s', doi)
    except HTTPError as error:
        if '404' in str(error):
            log.info('doi2pmid: DOI %s was not found in CrossRef.  Giving up.', doi)
            return None
        log.debug('doi2pmid: Unexpected HTTP error occurred during CrossRef lookup:')
        log.debug(error)
        return None

    try:
        pmids = pm_fetch.pmids_for_citation(**work.to_citation())
        if pmids:
            return interpret_pmids_for_citation_results(pmids)
        return None
    except NCBIServiceError:
        # Re-raise NCBI service errors from citation search
        raise
    except Exception as e:
        if _looks_like_service_issue(e):
            raise NCBIServiceError(
                f"Unable to search PubMed citations for DOI {doi} due to service issues.",
                'conversion_error',
                [
                    'Check NCBI service status with: ncbi_health_check --quick',
                    'Try again in a few minutes',
                    'Check your internet connection'
                ]
            ) from e
        raise




[docs]
def bookid2pmid(book_id):
    """Return whatever PubMedFetcher.pmid_for_bookID yields for the given book ID.

    The shared fetchers are started first, then the lookup is delegated to
    the module-level PubMedFetcher instance.
    """
    _start_engines()
    found = pm_fetch.pmid_for_bookID(book_id)
    return found




[docs]
def main():
    '''Command-line entry point for the `convert` tool.

    Parses the usage text in this module's __doc__ with docopt, runs the
    requested conversion (doi2pmid, pmid2doi or bookid2pmid) and prints the
    input and result. With --article / --work, also prints the PubMed article
    or CrossRef work when a usable PMID / DOI is available.

    Raises:
        ImportError: if docopt is not installed.
    '''
    if docopt is None:
        raise ImportError("docopt is required to run main()")
    args = docopt(__doc__, version=__version__)

    # Logger names are hierarchical, not glob patterns: 'metapub.*' would be a
    # separate logger that nothing writes to. Setting the level on the parent
    # 'metapub' logger is what reaches 'metapub.convert' and its siblings.
    if args['--quiet']:
        logging.getLogger('metapub').setLevel(logging.INFO)

    elif args['--debug']:
        logging.getLogger('metapub').setLevel(logging.DEBUG)

    # Not every sub-command produces both identifiers; start from None so the
    # optional --article / --work steps can tell what is available.
    pmid = None
    doi = None

    if args['doi2pmid']:
        doi = args['<doi>']
        print('DOI: ', doi)
        pmid = doi2pmid(doi)
        print('PMID: ', pmid)
    elif args['pmid2doi']:
        pmid = args['<pmid>']
        print('PMID: ', pmid)
        doi = pmid2doi(pmid)
        print('DOI: ', doi)
    elif args['bookid2pmid']:
        book_id = args['<book_id>']
        print('BookID: ', book_id)
        pmid = bookid2pmid(book_id)
        print('PMID: ', pmid)

    print()
    if args['--article']:
        # doi2pmid may return None or the marker 'AMBIGUOUS'; neither is a PMID.
        if pmid and pmid != 'AMBIGUOUS':
            pma = pm_fetch.article_by_pmid(pmid)
            print(pma)
        else:
            log.info('No single PMID available; skipping article information.')

    if args['--work']:
        # bookid2pmid yields no DOI, and pmid2doi may come back empty.
        if doi:
            work = cr_fetch.article_by_doi(doi)
            print(work)
        else:
            log.info('No DOI available; skipping CrossRef work information.')