Source code for metapub.urlreverse.methods

import re
from urllib.parse import unquote, urlparse

from ..dx_doi import DxDOI
from ..exceptions import DxDOIError, BadDOI
from ..text_mining import find_doi_in_string, scrape_doi_from_article_page
from ..utils import hostname_of, rootdomain_of

from .hostname2jrnl import HOSTNAME_TO_JOURNAL_MAP
from .hostname2doiprefix import HOSTNAME_TO_DOI_PREFIX_MAP


# string templates
# Punctuated ("official") Publisher Item Identifier layout; the five parts are
# sliced out of an unpunctuated PII by the ScienceDirect / Cell functions.
OFFICIAL_PII_FORMAT = '{pt1}-{pt2}({pt3}){pt4}-{pt5}'


# VIP (volume-issue-page)
re_vip = re.compile(r'(?P<hostname>.*?)\/content(\/\w+)?\/(?P<volume>\d+)\/(?P<issue>\w+)\/(?P<first_page>\w+)', re.I)

# PMID in url
re_pmidlookup = re.compile(r'.*?(\?|&)pmid=(?P<pmid>\d+)', re.I)
re_pubmed_pmid = re.compile(r'.*?ncbi\.nlm\.nih\.gov\/pubmed\/(?P<pmid>\d+)')

# PMCID in url
re_pmcid = re.compile(r'.*?(?P<hostname>ncbi\.nlm\.nih\.gov|europepmc\.org)\/.*?(?P<pmcid>PMC\d+)', re.I)

# PII -- see http://en.wikipedia.org/wiki/Publisher_Item_Identifier
pii_official = r'(?P<pii>S\d{4}-\d{4}\(\d{2}\)\d{5}-\w{1})'
re_sciencedirect_pii_simple = re.compile(r'.*?(?P<hostname>sciencedirect\.com)\/science\/article\/pii\/(?P<pii>S\d+\w?)', re.I)
re_sciencedirect_pii_official = re.compile(r'.*?(?P<hostname>sciencedirect\.com)\/science\/article\/pii\/' + pii_official, re.I)
# The trailing "X?" keeps a PII whose final check character is "X" intact
# (e.g. S2211-1247(15)01030-X written without punctuation).
re_cell_pii_simple = re.compile(r'.*?(?P<hostname>cell\.com)\/(?P<journal_abbrev>.*?)\/(pdf|abstract|fulltext|pdfExtended)\/(?P<pii>S\d+X?)', re.I)
re_cell_pii_official = re.compile(r'.*?cell\.com\/((?P<journal_abbrev>.*?)\/)?(pdf|abstract|fulltext|pdfExtended)\/' + pii_official, re.I)
re_cell_old_style = re.compile(r'.*?(?P<hostname>cell\.com)\/(pdf|abstract|fulltext)\/(?P<pii>\d+X?)', re.I)

# Unique
# "info" captures the path segment that follows the issue (it was previously
# an empty group followed by an uncaptured wildcard; the set of matching URLs
# is unchanged).
re_jstage = re.compile(r'.*?(?P<hostname>jstage\.jst\.go\.jp)\/article\/(?P<journal_abbrev>.*?)\/(?P<volume>\d+)\/(?P<issue>.*?)\/(?P<info>.*?)\/', re.I)
re_jci = re.compile(r'.*?jci\.org\/articles\/view\/(?P<jci_id>\d+)', re.I)
re_karger = re.compile(r'.*?(?P<hostname>karger\.com)\/Article\/(Abstract|Pdf)\/(?P<kid>\d+)', re.I)
#re_ahajournals = re.compile('\/(?P<doi_suffix>\w+\.\d+\.\d+\.\w+)', re.I)
re_ahajournals = re.compile(r'\/(?P<doi_suffix>[a-z0-9]+\.\d+\.\d+\.[a-z0-9]+)', re.I)
# "https?" (not "http?") so that both http:// and https:// URLs are accepted.
re_elifesciences = re.compile(r'(^|https?:\/\/)elifesciences\.org\/content\/(?P<volume>\d+)\/e(?P<ident>\d+)', re.I)
re_elifesciences_figures = re.compile(r'elifesciences\.org\/elife-articles\/(?P<ident>\d+)\/figures-pdf\/', re.I)

re_bmj = re.compile(r'(^|https?:\/\/)(?P<subdomain>\w+)\.bmj\.com\/content\/(?P<volume>\d+)\/(?P<doi_suffix>bmj\.\w+)', re.I)
re_bmj_vip_to_doi = re.compile(r'(^|https?:\/\/)(?P<subdomain>\w+)\.bmj\.com\/content\/(?P<volume>\d+)\/(?P<issue>\d+)\/(?P<first_page>\w+)', re.I)

# Early release formats
re_early_release = re.compile(r'(^|(https?):\/\/)(?P<hostname>.*?)\/content(\/\w+)?\/early\/(?P<year>\d+)\/(?P<month>\d+)\/(?P<day>\d+)\/(?P<doi_suffix>.*?)(\.full|\.pdf|\.abstract|$)')


# TODO: Common supplement URL format
#re_supplement_common = re.compile()
# http://jmg.bmj.com/content/suppl/2012/05/09/jmedgenet-2012-100892.DC1/Otocephaly_Supplementary_Table_3.pdf
# http://www.pnas.org/content/suppl/2013/07/08/1305207110.DCSupplemental/sapp.pdf
# http://jmg.bmj.com/content/suppl/2015/07/17/jmedgenet-2015-103132.DC1/jmedgenet-2015-103132supp.pdf

re_pnas_supplement = re.compile(r'.*?pnas\.org\/content\/suppl\/(?P<year>\d+)\/(?P<month>\d+)\/(?P<day>\d+)\/(?P<ident>.*?)\/', re.I)

# dx.doi.org self-caching lookup engine (created lazily by DXDOI()).
DXDOI_INSTANCE = None


[docs]
def DXDOI():
    """ Return the module-wide DxDOI lookup engine, creating it on first use.

    The instance is stored in the module global DXDOI_INSTANCE so later calls
    hand back the same object.

    :return: DxDOI instance
    """
    global DXDOI_INSTANCE
    DXDOI_INSTANCE = DXDOI_INSTANCE or DxDOI()
    return DXDOI_INSTANCE




[docs]
def get_journal_name_from_url(url):
    """ Look up the journal registered for this URL's hostname.

    A URL that does not start with "http" gets "http://" prepended before its
    hostname is extracted.

    :param url: (str)
    :return: the HOSTNAME_TO_JOURNAL_MAP entry for the hostname, or None if absent
    """
    has_scheme = url.lower().startswith('http')
    full_url = url if has_scheme else 'http://' + url
    return HOSTNAME_TO_JOURNAL_MAP.get(hostname_of(full_url))




[docs]
def get_pnas_doi_from_link(url):
    """ PNAS (Proceedings of the National Academy of Sciences of the USA) supplement links.

    The path segment after the yyyy/mm/dd date carries the article identifier
    followed by a dotted marker; everything before the first dot becomes the
    DOI suffix.

    Examples:
        http://www.pnas.org/content/suppl/2013/07/08/1305207110.DCSupplemental/sapp.pdf --> 10.1073/pnas.1305207110

    :param url: (str)
    :return: doi (str) or None
    """
    found = re_pnas_supplement.match(url)
    if not found:
        return None
    article_id = found.group('ident').split('.')[0]
    return '10.1073/pnas.' + article_id




[docs]
def get_elifesciences_doi_from_link(url):
    """ eLIFE / http://elifesciences.org

    The numeric article identifier found in the URL is appended to the
    "10.7554/eLife." prefix.

    Examples:
        * http://elifesciences.org/content/5/e12203 --> 10.7554/eLife.12203
        * http://elifesciences.org/content/4/e11205 --> 10.7554/eLife.11205
        * http://elifesciences.org/content/4/e11205-download.pdf --> 10.7554/eLife.11205
        * http://cdn.elifesciences.org/elife-articles/11205/figures-pdf/elife11205-figures.pdf?xxxx --> 10.7554/eLife.11205

    :param url: (str)
    :return: doi (str) or None
    """
    if 'elifesciences.org' not in url:
        return None

    for pattern in (re_elifesciences, re_elifesciences_figures):
        # search(), not match(): the figures pattern begins at the hostname, so
        # it can never match at position 0 of a URL that carries a scheme or a
        # subdomain (such as the cdn example above).
        match = pattern.search(url)
        if match:
            return '10.7554/eLife.' + match.group('ident')

    return None




[docs]
def get_bmj_doi_from_link(url):
    """ BMJ and its subsidiary journals use a VIP-ish URL format that can *sometimes* be mapped
    onto the real DOI. When that fails, the VIP->citation routines should be used instead.

    List of BMJ Journals: http://journals.bmj.com/

    Examples:
        http://jmg.bmj.com/content/39/6/e31.full --> 10.1136/jmg.39.6.e31
        http://www.bmj.com/content/353/bmj.i2195 --> 10.1136/bmj.i2195
        http://www.bmj.com/content/353/bmj.i2139 --> 10.1136/bmj.i2139

    Returns None (should be caught by find_doi_in_string):
        http://bmjopengastro.bmj.com/doi/full/10.1136/bmjgast-2015-000075 --> 10.1136/bmjgast-2015-000075

    Returns None (must use VIP->citation routines):
        http://gut.bmj.com/content/65/5/767.abstract --> 10.1136/gutjnl-2015-311246

    The candidate DOI is resolved through DXDOI() before being returned; a
    BadDOI or DxDOIError from that lookup yields None.

    :param url: (str)
    :return: doi (str) or None
    """
    if 'bmj.com' not in url:
        return None

    prefix = '10.1136/'
    # subdomains whose volume/issue/page URL maps directly onto a DOI
    vip_subdomains = ['jmg']
    candidate = None

    vip_match = re_bmj_vip_to_doi.match(url)
    if vip_match is not None:
        fields = vip_match.groupdict()
        if fields['subdomain'] in vip_subdomains:
            candidate = prefix + '{subdomain}.{volume}.{issue}.{first_page}'.format(**fields)
    else:
        suffix_match = re_bmj.match(url)
        if suffix_match is not None:
            candidate = prefix + suffix_match.groupdict()['doi_suffix']

    if not candidate:
        return None

    # the composed DOI might be a dud, so verify it before handing it back.
    try:
        DXDOI().resolve(candidate)
    except (BadDOI, DxDOIError):
        return None
    return candidate




[docs]
def get_spandidos_doi_from_link(url):
    """ Spandidos URLs follow several conventions, so no attempt is made to parse them:
    the article page is loaded and scraped for a DOI. Any "download" in the URL
    is swapped for "abstract" before the page is requested.

    Examples:
        http://www.spandidos-publications.com/or/30/2/553 --> 10.3892/or.2013.2535
        http://www.spandidos-publications.com/10.3892/or.2013.2535/abstract --> 10.3892/or.2013.2535

    :param url: (str)
    :return: doi (str) or None
    """
    if 'spandidos-publications.com' in url:
        abstract_url = url.replace('download', 'abstract')
        return scrape_doi_from_article_page(abstract_url)
    return None




[docs]
def get_karger_doi_from_link(url):
    """ The Karger ID sits in the URL right after the "Abstract" or "Pdf" segment. Left-padding
    it with zeroes to nine digits and prefixing the Karger publisher code (10.1159)
    yields the DOI.

    e.g.
       http://www.karger.com/Article/Abstract/329047 --> 10.1159/000329047
       http://www.karger.com/Article/Abstract/83388 --> 10.1159/000083388

    :param url: (str)
    :return: doi (str) or None
    """
    found = re_karger.match(url)
    if found is None:
        return None
    # IDs already nine digits or longer are left as they are.
    return '10.1159/' + found.group('kid').zfill(9)




[docs]
def get_jstage_doi_from_link(url):
    """ J-STAGE URLs are too unpredictable in the segment that should hold the first page,
    so the DOI is scraped from the article page instead: "_pdf" in the URL is
    replaced with "_article" and that page is loaded.

    :param url: (str)
    :return: doi or None (None when the URL is not a recognised J-STAGE article link)
    """
    if re_jstage.match(url) is None:
        return None
    article_url = url.replace('_pdf', '_article') if '_pdf' in url else url
    return scrape_doi_from_article_page(article_url)




[docs]
def get_sciencedirect_doi_from_link(url):
    """ We can extract the PII from most sciencedirect links. To get a DOI, we may be able to
    simply append the PII to the publisher code "10.1016/", or we may have to inject the special
    character separators into the PII numbers.

    Example:
        http://www.sciencedirect.com/science/article/pii/S0094576599000673

        PII = S0094576599000673
        DOI = 10.1016/S0094-5765(99)00067-3

    The composed DOI is checked with DXDOI(); if that lookup rejects it, the
    article page is scraped for a DOI instead.

    :param url: (str)
    :return: doi or None
    """
    if 'sciencedirect.com' not in url:
        return None

    # Try the punctuated form first: the unpunctuated pattern also matches the
    # leading "S0094" of a punctuated PII, which is too short to be sliced.
    match = re_sciencedirect_pii_official.match(url)
    if match:
        pii = match.group('pii')
    else:
        match = re_sciencedirect_pii_simple.match(url)
        if not match:
            return None
        raw_pii = match.group('pii')
        if len(raw_pii) < 17:
            # "S" + 16 characters are needed to fill every part of the template.
            return None
        pii = OFFICIAL_PII_FORMAT.format(pt1=raw_pii[:5], pt2=raw_pii[5:9], pt3=raw_pii[9:11],
                                         pt4=raw_pii[11:16], pt5=raw_pii[16])

    doi = '10.1016/' + pii
    try:
        DXDOI().resolve(doi)
        return doi
    except (BadDOI, DxDOIError):
        # the PII-derived DOI was not accepted; fall through to the page scrape.
        pass

    # use URL scrape
    return scrape_doi_from_article_page('http://www.sciencedirect.com/science/article/pii/%s' % pii)




[docs]
def get_cell_doi_from_link(url):
    """ Cell and ScienceDirect links have similar properties, but there are several different url
    types for Cell abstracts and PDFs (much like biomedcentral).

    Three PII spellings are recognised, tried in this order: the punctuated
    "official" form, the unpunctuated form with a leading "S", and the old
    style without the "S". For a handful of journals the PII does not map onto
    the DOI, so the abstract page is scraped instead.

    Examples:
        http://www.cell.com/pdf/0092867480906212.pdf --> 10.1016/0092-8674(80)90621-2
        http://www.cell.com/cancer-cell/pdf/S1535610806002844.pdf --> 10.1016/j.ccr.2006.09.010
        http://www.cell.com/molecular-cell/abstract/S1097-2765(00)80321-4 --> 10.1016/S1097-2765(00)80321-4
        http://www.cell.com/current-biology/fulltext/S0960-9822%2816%2930170-1 --> 10.1016/j.cub.2016.03.002
        http://www.cell.com/cell-reports/pdfExtended/S2211-1247(15)01030-X --> 10.1016/j.celrep.2015.09.019
        http://www.cell.com/ajhg/pdfExtended/S0002-9297(16)30051-9 --> 10.1016/j.ajhg.2016.03.016
        http://www.cell.com/ajhg/pdf/S0002-9297(16)00050-1.pdf --> 10.1016/j.ajhg.2016.03.016

    Unsolved cases:
        http://www.cell.com/cms/attachment/2020150130/2039963519/mmc1.pdf --> 10.1016/j.neuron.2014.09.027
        http://www.cell.com/cms/attachment/2024895080/2044576473/mmc1.pdf --> 10.1016/j.ajhg.2009.01.009
        http://www.cell.com/cms/attachment/2030360419/2047969851/mmc1.xlsx --> ?
        http://www.cell.com/cms/attachment/2030360419/2047969852/mmc2.xlsx --> ?

    :param url: (str)
    :return: doi or None
    """
    if 'cell.com' not in url:
        return None

    # Decode %28 / %29 so a percent-encoded official PII is recognised as such
    # instead of being mistaken for a (truncated) unpunctuated one.
    url = unquote(url)

    # (pattern, number of leading characters before the 16-character PII body);
    # None means the PII is already punctuated and needs no reformatting.
    attempts = (
        (re_cell_pii_official, None),
        (re_cell_pii_simple, 1),     # leading "S"
        (re_cell_old_style, 0),      # no "S" in front
    )
    for pattern, lead in attempts:
        match = pattern.match(url)
        if match:
            break
    else:
        return None

    journal_abbrev = match.groupdict().get('journal_abbrev')
    if journal_abbrev in ('cancer-cell', 'current-biology', 'cell-reports', 'ajhg'):
        # These journals need the DOI read off the abstract page.
        abstract_url = url.replace('pdfExtended', 'abstract')
        abstract_url = abstract_url.replace('/pdf/', '/abstract/')
        abstract_url = abstract_url.replace('.pdf', '')
        return scrape_doi_from_article_page(abstract_url)

    pii = match.group('pii')
    if lead is not None:
        if len(pii) < lead + 16:
            # too short to fill every part of the official template
            return None
        pii = OFFICIAL_PII_FORMAT.format(pt1=pii[:lead + 4],
                                         pt2=pii[lead + 4:lead + 8],
                                         pt3=pii[lead + 8:lead + 10],
                                         pt4=pii[lead + 10:lead + 15],
                                         pt5=pii[lead + 15])
    return '10.1016/' + pii



# TODO: nature function needs improvement (Older articles, mostly).

[docs]
def get_nature_doi_from_link(link):
    """ Custom method to get a DOI from a nature.com URL

    Strategies, tried in order:
        1. journal abbreviation (first path segment) followed by digits somewhere in the link
        2. ".../articles/<abbrev><digits>" links
        3. ".../full/<digits>" or ".../abs/<digits>" links (older "sj." DOIs)
        4. scrape the article page

    Examples:
        http://www.nature.com/neuro/journal/v13/n11/abs/nn.2662.html --> 10.1038/nn.2662
        http://www.nature.com/onc/journal/v26/n57/full/1210594a.html --> 10.1038/sj.onc.1210594
        http://www.nature.com/pr/journal/v79/n5/full/pr201635a.html --> 10.1038/pr.2016.35
        http://www.nature.com/articles/cr2009141 --> 10.1038/cr.2009.141
        http://www.nature.com/articles/nature03404 --> 10.1038/nature03404

    Older articles may have very different DOIs that this function composes
    incorrectly, e.g. from Pediatric Research ('pr'):
        http://www.nature.com/pr/journal/v49/n1/full/pr20018a.html --> 10.1203/00006450-200101000-00008

    :param link: the URL
    :return: a string containing a DOI, if one was resolved, or None
    """
    # TODO: check validity of DOI before returning.
    # Some older articles need to have their pages loaded and doi scraped.

    if 'nature.com' not in link:
        return None

    # Non-comprehensive list of nature journals whose DOI suffix can be taken
    # verbatim from the link. Every other journal is treated as "style 2",
    # i.e. <abbrev>.<4-digit year>.<number>, for example aps, bjc, cddis, cr,
    # ejhg, gim, jcbfm, jhg, jid, labinvest, leu, modpathol, mp, onc, oncsis, pr.
    #   link: http://www.nature.com/modpathol/journal/vaop/ncurrent/extref/modpathol2014160x3.xlsx
    #   doi:  10.1038/modpathol.2014.160
    style1journals = ['gimo', 'nature', 'nbt', 'ncb', 'nchembio', 'ncomms', 'ng', 'nm', 'nn',
                      'nrc', 'nrm', 'nsmb', 'srep']

    # The first path segment is the journal abbreviation. It stays None when
    # the link has no such segment, so later steps can skip themselves.
    journal_abbrev = None
    match = re.search(r'nature\.com/([a-zA-Z]+)/', link)
    if match:
        journal_abbrev = match.group(1)

    # Example: http://www.nature.com/neuro/journal/v13/n11/abs/nn.2662.html
    if journal_abbrev == 'neuro':
        journal_abbrev = 'nn'

    if journal_abbrev:
        match = re.search(r'%s\.{0,1}\d+' % re.escape(journal_abbrev), link)
        if match:
            doi_suffix = match.group(0)

            # the DOI suffix can be taken directly for these journals
            if journal_abbrev in style1journals:
                return '10.1038/{}'.format(doi_suffix)

            # style2journals are the default
            digits_start = len(journal_abbrev)
            year = doi_suffix[digits_start:digits_start + 4]
            num = doi_suffix[digits_start + 4:]
            return '10.1038/{}.{}.{}'.format(journal_abbrev, year, num)

    # http://www.nature.com/articles/cr2009141
    # http://www.nature.com/articles/cddis201475
    # http://www.nature.com/articles/nature03404
    # http://www.nature.com/articles/ng.2223
    # http://www.nature.com/articles/nsmb.2666
    match = re.search(r'articles/(([a-z]+)\.{0,1}(\d+))', link)
    if match:
        suffix, article_abbrev, num = match.group(1, 2, 3)
        if article_abbrev in style1journals:
            return '10.1038/{}'.format(suffix)
        return '10.1038/{}.{}.{}'.format(article_abbrev, num[:4], num[4:])

    # http://www.nature.com/leu/journal/v19/n11/abs/2403943a.html : 10.1038/sj.leu.2403943
    # http://www.nature.com/onc/journal/v26/n57/full/1210594a.html :  doi:10.1038/sj.onc.1210594
    match = re.search(r'full/\d+|abs/\d+', link)
    if match and journal_abbrev:
        num = match.group(0).split('/')[1]
        return '10.1038/sj.{}.{}'.format(journal_abbrev, num)

    # nothing? try scraping the page.
    link = link.replace('.pdf', '.html')
    return scrape_doi_from_article_page(link)




[docs]
def get_biomedcentral_doi_from_link(link):
    """ Custom method to get a DOI from a biomedcentral.com URL

    Style 1 ("/content/" links): the DOI suffix is the filename without its extension.
        http://www.biomedcentral.com/content/pdf/bcr1282.pdf --> 10.1186/bcr1282
        http://www.biomedcentral.com/content/pdf/1465-9921-12-49.pdf --> 10.1186/1465-9921-12-49
        http://www.biomedcentral.com/content/pdf/1471-2164-16-S1-S3.pdf --> 10.1186/1471-2164-16-S1-S3

      For supplementary files the trailing "-S<n>" part is removed:
        http://www.biomedcentral.com/content/supplementary/bcr1865-S3.doc --> 10.1186/bcr1865
        http://www.biomedcentral.com/content/supplementary/1471-2105-11-300-S1.PDF --> 10.1186/1471-2105-11-300
        http://www.biomedcentral.com/content/supplementary/1471-2164-14-S3-S7-S1.xlsx --> 10.1186/1471-2164-14-S3-S7
        http://www.biomedcentral.com/content/supplementary/gb-2013-14-10-r108-S8.xlsx --> 10.1186/gb-2013-14-10-r108

    Style 2 (everything else): the URL path with "/" turned into "-".
        http://www.biomedcentral.com/1471-2148/12/114 --> 10.1186/1471-2148-12-114
        http://www.biomedcentral.com/1471-2164/15/707/table/T2 --> 10.1186/1471-2164-15-707
        http://www.biomedcentral.com/1471-2164/14/S1/S11 --> 10.1186/1471-2164-14-S1-S11
        http://www.biomedcentral.com/1471-230X/11/31 --> 10.1186/1471-230X-11-31

    :param link: (str) the URL
    :return: doi (str) or None
    """
    if 'biomedcentral.com' not in link:
        return None

    prefix = '10.1186/'

    # first, try to use the filename
    if '/content/' in link:
        filename = link.split('/')[-1]
        if '.' not in filename:
            return None
        base = filename.split('.')[0]
        if '/pdf/' in link:
            return prefix + base
        if '/supplementary/' in link:
            # Drop the final "-S..." segment (the supplement number). A name
            # without such a segment is returned whole rather than truncated.
            return prefix + re.sub(r'-[Ss][^-]*$', '', base)
        return None

    # Without a scheme urlparse() would file the hostname under the path.
    if not link.lower().startswith('http'):
        link = 'http://' + link
    path = urlparse(link).path

    # cut the path off at the first of these sub-page keywords that occurs in it
    for keyword in ('abstract', 'figure', 'table'):
        position = path.find(keyword)
        if position > -1:
            path = path[:position]
            break

    path = path.strip('/')
    if not path:
        # bare hostname: nothing to build a DOI from
        return None
    return prefix + path.replace('/', '-')




[docs]
def get_jci_doi_from_link(url):
    """ Journal of Clinical Investigation (JCI) links carry a numerical ID; appending it
    to "10.1172/JCI" reconstructs the article's DOI.

    Example:
        http://www.jci.org/articles/view/32496 --> 10.1172/JCI32496
        http://www.jci.org/articles/view/8154/version/1/pdf/render --> 10.1172/JCI8154

    :param url: (str)
    :return: doi or None
    """
    found = re_jci.match(url)
    if found is None:
        return None
    return '10.1172/JCI' + found.group('jci_id')




[docs]
def get_ahajournals_doi_from_link(url):
    """ For an ahajournals.org link, try to compose a DOI from the publisher base 10.1161 and
    the article-identifying piece of the URL; otherwise scrape the article page
    (with any ".pdf" removed from the URL).

    Example:
        http://circimaging.ahajournals.org/content/suppl/2013/04/02/CIRCIMAGING.112.000333.DC1/000333_Supplemental_Material.pdf
                --> 10.1161/CIRCIMAGING.112.000333
        http://jaha.ahajournals.org/content/4/12/e002395.full.pdf --> 10.1161/JAHA.115.002395

    :param url: (str)
    :return: doi or None
    """
    if 'ahajournals.org' not in url:
        return None

    # NOTE(review): re_ahajournals starts with a literal "/" and match() only
    # tests position 0, so this succeeds only for input beginning with "/";
    # ordinary URLs fall through to the scrape. Confirm whether search() (plus
    # trimming of tails such as ".DC1") was intended.
    found = re_ahajournals.match(url)
    if found is not None:
        return '10.1161/' + found.group('doi_suffix')

    return scrape_doi_from_article_page(url.replace('.pdf', ''))




[docs]
def get_early_release_doi_from_link(url):
    """ Compose a DOI from an "early release" URL (.../content/early/yyyy/mm/dd/<suffix>).

    The publisher's DOI prefix is looked up in HOSTNAME_TO_DOI_PREFIX_MAP, first
    by exact hostname and then by a "*.<root domain>" wildcard entry.

    Examples:
        http://cancerres.aacrjournals.org/content/early/2015/12/30/0008-5472.CAN-15-0295.full.pdf --> 10.1158/0008-5472.CAN-15-0295
        http://ajcn.nutrition.org/content/early/2016/04/20/ajcn.115.123752.abstract --> 10.3945/ajcn.115.123752
        http://www.mcponline.org/content/early/2016/04/25/mcp.O115.055467.full.pdf+html --> 10.1074/mcp.O115.055467
        http://nar.oxfordjournals.org/content/early/2013/11/21/nar.gkt1163.full.pdf --> 10.1093/nar/gkt1163
        http://jmg.bmj.com/content/early/2008/07/08/jmg.2008.058297 --> 10.1136/jmg.2008.058297

    :param url: (str)
    :return: doi or None
    """
    match = re_early_release.match(url)
    if not match:
        return None

    resd = match.groupdict()
    hostname = hostname_of(resd['hostname'])
    root_domain = rootdomain_of(hostname)
    doi_suffix = resd['doi_suffix']

    # special treatment for oxfordjournals.org: "<journal>.<article>" in the URL
    # becomes "<journal>/<article>" in the DOI.
    if root_domain == 'oxfordjournals.org':
        journal, dot, article = doi_suffix.partition('.')
        if not dot:
            # no "." to split on, so the DOI cannot be composed
            return None
        return HOSTNAME_TO_DOI_PREFIX_MAP['*.oxfordjournals.org'] + '/' + journal + '/' + article

    # exact hostname first, then a "wildcard" subdomain entry if the map has one.
    for key in (hostname, '*.%s' % root_domain):
        if key in HOSTNAME_TO_DOI_PREFIX_MAP:
            return HOSTNAME_TO_DOI_PREFIX_MAP[key] + '/' + doi_suffix

    return None




[docs]
def get_generic_doi_from_link(url):
    """ Covers many publisher URLs such as wiley and springer, where the DOI is spelled out
    in the URL itself.

    Examples:
        http://onlinelibrary.wiley.com/doi/10.1111/j.1582-4934.2011.01476.x/full --> 10.1111/j.1582-4934.2011.01476.x
        link.springer.com/article/10.1186/1471-2164-7-243 --> 10.1186/1471-2164-7-243
        http://link.springer.com/article/10.1007/s004399900122 --> 10.1007/s004399900122

    :param url: (str)
    :return: doi or None
    """
    doi = find_doi_in_string(url)
    if not doi:
        # nothing DOI-like in the URL, so skip the dx.doi.org lookup entirely.
        return None

    # remove common addenda that may have come from the regular expression.
    for addendum in ('/full', '/asset', '/pdf', '.pdf'):
        place = doi.find(addendum)
        if place > -1:
            doi = doi[:place]

    # we had better check ourselves before we wreck ourselves.
    try:
        DXDOI().resolve(doi)
    except (BadDOI, DxDOIError):
        return None
    return doi




[docs]
def get_plos_doi_from_link(url):
    """ PLOS links (almost?) always contain the DOI, with a twist -- some links
    are DOIs pointing straight at an article supplement.

    For example:

        Supplement doi: 10.1371/journal.pone.0094554.s002
        Article doi: 10.1371/journal.pone.0094554

    Since the article DOI is what's wanted for PMID gathering purposes, the DOI
    is cut back to its first four dot-separated parts (and any "#fragment" is
    dropped), which yields the parent article's DOI.

    Examples:
        http://journals.plos.org/plosone/article?id=10.1371%2Fjournal.pone.0154075 --> 10.1371/journal.pone.0154075
        http://journals.plos.org/plosone/article?id=info%3Adoi%2F10.1371%2Fjournal.pone.0153994 --> 10.1371/journal.pone.0153994
        http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0152441#pone-0152441-t002 --> 10.1371/journal.pone.0152441
        http://journals.plos.org/plosone/article/asset?unique&id=info:doi/10.1371/journal.pone.0094554.s002 --> 10.1371/journal.pone.0094554

    :param url: (str)
    :return: doi (str) or None
    """
    if 'plos.org' not in url:
        return None

    found = find_doi_in_string(url)
    if not found:
        return None

    without_fragment = found.split('#', 1)[0]
    return '.'.join(without_fragment.split('.')[:4])



# == DOI search method registry. ORDER MATTERS: try_doi_methods() walks this list from top to
# bottom and returns the first non-empty result, so do not reorder it without understanding
# what each method matches. == #
#
# Comments to the right of each method denote which "expensive" operations they use:
#       * "scrape": loading the page to read text off it (usually to get the DOI)
#       * "DxDOI": loading the DOI in dx.doi.org to verify that it is a real DOI.
#
# Some URLs can be reversed to DOIs using 2 or 3 different methods. We're trying to use the least "expensive"
# method that gets the job done, while making sure to avoid false positives.

DOI_METHODS = [get_elifesciences_doi_from_link,
               get_plos_doi_from_link,
               get_early_release_doi_from_link,
               get_cell_doi_from_link,              # uses scrape (for some journals)
               get_jci_doi_from_link,
               get_jstage_doi_from_link,            # uses scrape (1st)
               get_pnas_doi_from_link,
               get_bmj_doi_from_link,               # uses DxDOI
               get_ahajournals_doi_from_link,       # uses scrape (2nd)
               get_biomedcentral_doi_from_link,
               get_nature_doi_from_link,            # uses scrape (last)
               get_sciencedirect_doi_from_link,     # uses scrape (last) and DxDOI
               get_karger_doi_from_link,
               get_spandidos_doi_from_link,         # uses scrape (1st)
               get_generic_doi_from_link,           # uses DxDOI
               ]



[docs]
def try_doi_methods(url):
    """ Runs the "get_*_doi_from_link" methods registered in DOI_METHODS, in order, and stops at
    the first one that produces a DOI.

    :param url: (str)
    :return: {'doi': <doi>, 'method': <method>} or None
    """
    # Lazy pipeline: each method is only called if every earlier one came up empty.
    attempts = ((method(url), method) for method in DOI_METHODS)
    hits = ({'doi': doi, 'method': method} for doi, method in attempts if doi)
    return next(hits, None)




[docs]
def try_vip_methods(url):
    """ Many URLs follow the "volume-issue-page" format. For such a URL this function returns a
    dictionary holding the named groups of the VIP pattern (hostname, volume, issue,
    first_page) plus 'format' (always 'vip') and 'jtitle'. 'jtitle' is None when
    metapub does not know the journal behind this domain name.

    See metapub/urlreverse/hostname2journal.py for the list of supported journals (and please consider
    contributing to the list if you can).

    :param url: (str)
    :return: dict or None
    """
    found = re_vip.match(url)
    if found is None:
        return None

    journal_title = get_journal_name_from_url(url)
    result = found.groupdict()
    result['format'] = 'vip'
    result['jtitle'] = journal_title
    return result




[docs]
def try_pmid_methods(url):
    """ Attempts to get the PMID directly out of the URL, first from a "pmid=" query
    parameter and then from an ncbi.nlm.nih.gov/pubmed/ path.

    Examples:
        https://www.ncbi.nlm.nih.gov/pubmed/22253870 --> 22253870
        http://aac.asm.org/cgi/pmidlookup?view=long&pmid=7689822 --> 7689822

    :param url: (str)
    :return: pmid or None
    """
    for pattern in (re_pmidlookup, re_pubmed_pmid):
        found = pattern.match(url)
        if found:
            return found.group('pmid')
    return None