Source code for metapub.urlreverse.methods

import re

from ..dx_doi import DxDOI
from ..exceptions import DxDOIError, BadDOI
from ..text_mining import find_doi_in_string, scrape_doi_from_article_page
from ..utils import hostname_of, rootdomain_of

from .hostname2jrnl import HOSTNAME_TO_JOURNAL_MAP
from .hostname2doiprefix import HOSTNAME_TO_DOI_PREFIX_MAP


# string templates
# str.format() template with five named slots (pt1..pt5), rendered as
# "pt1-pt2(pt3)pt4-pt5" -- the same punctuation layout that the `pii_official`
# regex in this module recognises.
# NOTE(review): presumably used to rebuild a punctuated PII from an
# unpunctuated one; confirm against the callers.
OFFICIAL_PII_FORMAT = '{pt1}-{pt2}({pt3}){pt4}-{pt5}'


# VIP (volume-issue-page)
# Named groups: hostname, volume, issue, first_page (case-insensitive).
# One optional extra path segment is tolerated directly after "/content".
# Note that `hostname` is a lazy catch-all for everything that precedes
# "/content", so when the URL carries a scheme ("http://...") the scheme is
# part of the captured value.
re_vip = re.compile(r'(?P<hostname>.*?)\/content(\/\w+)?\/(?P<volume>\d+)\/(?P<issue>\w+)\/(?P<first_page>\w+)', re.I)

# PMID in url
# Both patterns expose the identifier through the named group `pmid`.

# Query-string form, e.g. http://aac.asm.org/cgi/pmidlookup?view=long&pmid=7689822
re_pmidlookup = re.compile(r'.*?(\?|&)pmid=(?P<pmid>\d+)', re.I)

# PubMed path form, e.g. https://www.ncbi.nlm.nih.gov/pubmed/22253870
# The dots of the hostname are escaped so that they only match a literal "."
# (an unescaped "." would accept any character in those positions).
re_pubmed_pmid = re.compile(r'.*?ncbi\.nlm\.nih\.gov\/pubmed\/(?P<pmid>\d+)')

# PMCID in url
# Named groups: hostname (one of the two supported sites) and pmcid ("PMC"
# followed by digits; matched case-insensitively). Hostname dots are escaped
# so they only match a literal ".".
re_pmcid = re.compile(r'.*?(?P<hostname>ncbi\.nlm\.nih\.gov|europepmc\.org)\/.*?(?P<pmcid>PMC\d+)', re.I)

# PII -- see http://en.wikipedia.org/wiki/Publisher_Item_Identifier
# Punctuated PII: "S", 4 digits, "-", 4 digits, "(", 2 digits, ")", 5 digits,
# "-", one word character. Captured as the named group `pii`.
pii_official = r'(?P<pii>S\d{4}-\d{4}\(\d{2}\)\d{5}-\w{1})'

# ScienceDirect article URLs: "simple" is an unpunctuated PII ("S", digits,
# optional trailing word character); "official" is the punctuated layout above.
re_sciencedirect_pii_simple = re.compile(r'.*?(?P<hostname>sciencedirect\.com)\/science\/article\/pii\/(?P<pii>S\d+\w?)', re.I)
re_sciencedirect_pii_official = re.compile(r'.*?(?P<hostname>sciencedirect\.com)\/science\/article\/pii\/' + pii_official, re.I)

# Cell Press URLs. The "official" variant treats the journal abbreviation as
# optional; its hostname dot is escaped like in the sibling patterns so that it
# only matches a literal ".".
re_cell_pii_simple = re.compile(r'.*?(?P<hostname>cell\.com)\/(?P<journal_abbrev>.*?)\/(pdf|abstract|fulltext|pdfExtended)\/(?P<pii>S\d+)', re.I)
re_cell_pii_official = re.compile(r'.*?cell\.com\/((?P<journal_abbrev>.*?)\/)?(pdf|abstract|fulltext|pdfExtended)\/' + pii_official, re.I)
# Old-style Cell URLs: no journal abbreviation and a purely numeric identifier.
re_cell_old_style = re.compile(r'.*?(?P<hostname>cell\.com)\/(pdf|abstract|fulltext)\/(?P<pii>\d+)', re.I)

# Unique
# J-STAGE article URLs. Named groups: hostname, journal_abbrev, volume, issue
# and info, where `info` is the path segment that follows the issue.
re_jstage = re.compile(r'.*?(?P<hostname>jstage\.jst\.go\.jp)\/article\/(?P<journal_abbrev>.*?)\/(?P<volume>\d+)\/(?P<issue>.*?)\/(?P<info>.*?)\/', re.I)
# JCI article view URLs; the numeric article id is captured as `jci_id`.
re_jci = re.compile(r'.*?jci\.org\/articles\/view\/(?P<jci_id>\d+)', re.I)
# Karger abstract / PDF URLs; the numeric id is captured as `kid`.
re_karger = re.compile(r'.*?(?P<hostname>karger\.com)\/Article\/(Abstract|Pdf)\/(?P<kid>\d+)', re.I)
# AHA journals: a four-part dotted token following a "/", captured as
# `doi_suffix`. The explicit [a-z0-9] classes replaced an earlier \w-based
# pattern; unlike \w they do not accept "_".
re_ahajournals = re.compile(r'\/(?P<doi_suffix>[a-z0-9]+\.\d+\.\d+\.[a-z0-9]+)', re.I)
# eLife content URLs, with or without a leading http:// or https:// scheme.
re_elifesciences = re.compile(r'(^|https?:\/\/)elifesciences\.org\/content\/(?P<volume>\d+)\/e(?P<ident>\d+)', re.I)
# eLife "figures-pdf" URLs; only the article identifier is captured.
re_elifesciences_figures = re.compile(r'elifesciences\.org\/elife-articles\/(?P<ident>\d+)\/figures-pdf\/', re.I)

# BMJ content URLs on any <subdomain>.bmj.com host, with or without a leading
# http:// or https:// scheme. Hostname dots are escaped so they only match ".".
# Form 1: /content/<volume>/<doi_suffix>, where doi_suffix starts with "bmj".
re_bmj = re.compile(r'(^|https?:\/\/)(?P<subdomain>\w+)\.bmj\.com\/content\/(?P<volume>\d+)\/(?P<doi_suffix>bmj.\w+)', re.I)
# Form 2: /content/<volume>/<issue>/<first_page> (volume-issue-page layout).
re_bmj_vip_to_doi = re.compile(r'(^|https?:\/\/)(?P<subdomain>\w+)\.bmj\.com\/content\/(?P<volume>\d+)\/(?P<issue>\d+)\/(?P<first_page>\w+)', re.I)

# Early release formats
# Layout: [scheme://]<hostname>/content[/<segment>]/early/<year>/<month>/<day>/<doi_suffix>
# optionally followed by ".full", ".pdf" or ".abstract".
#
# The scheme branch must come BEFORE the bare "^" branch: `hostname` is a lazy
# ".*?", so if "^" were tried first it would always succeed and `hostname` would
# absorb the "http://" prefix. Group numbering is unchanged (group 1 is the
# whole prefix alternative, group 2 the scheme itself).
re_early_release = re.compile(r'((https?):\/\/|^)(?P<hostname>.*?)\/content(\/\w+)?\/early\/(?P<year>\d+)\/(?P<month>\d+)\/(?P<day>\d+)\/(?P<doi_suffix>.*?)(\.full|\.pdf|\.abstract|$)')


# TODO: Common supplement URL format
#re_supplement_common = re.compile()
# Example supplement URLs a common pattern would need to cover:
# http://jmg.bmj.com/content/suppl/2012/05/09/jmedgenet-2012-100892.DC1/Otocephaly_Supplementary_Table_3.pdf
# http://www.pnas.org/content/suppl/2013/07/08/1305207110.DCSupplemental/sapp.pdf
# http://jmg.bmj.com/content/suppl/2015/07/17/jmedgenet-2015-103132.DC1/jmedgenet-2015-103132supp.pdf

# PNAS supplement URLs. Named groups: year, month, day and ident, where `ident`
# is the path segment after the date (e.g. "1305207110.DCSupplemental").
# The hostname dot is escaped so it only matches a literal ".".
re_pnas_supplement = re.compile(r'.*?pnas\.org\/content\/suppl\/(?P<year>\d+)\/(?P<month>\d+)\/(?P<day>\d+)\/(?P<ident>.*?)\/', re.I)

# dx.doi.org self-caching lookup engine.
# Module-level slot for the shared DxDOI object. It starts out as None and is
# filled in by DXDOI() the first time that function is called.
DXDOI_INSTANCE = None

def DXDOI():
    """Return the shared DxDOI lookup object, building it on first use.

    The object is stored in the module-level ``DXDOI_INSTANCE`` so that later
    calls hand back the same instance instead of constructing a new one.

    :return: the module-wide DxDOI instance
    """
    global DXDOI_INSTANCE
    if DXDOI_INSTANCE:
        return DXDOI_INSTANCE
    # Nothing usable stored yet: construct the engine and remember it.
    DXDOI_INSTANCE = DxDOI()
    return DXDOI_INSTANCE
def get_journal_name_from_url(url):
    """Look up the journal registered for the hostname of ``url``.

    If ``url`` does not start with "http" (checked case-insensitively),
    "http://" is prepended before the hostname is extracted with
    ``hostname_of``.

    :param url: (str)
    :return: the HOSTNAME_TO_JOURNAL_MAP entry for the hostname, or None when
        the hostname is not in the map
    """
    has_scheme = url.lower().startswith('http')
    full_url = url if has_scheme else 'http://' + url
    return HOSTNAME_TO_JOURNAL_MAP.get(hostname_of(full_url))
# TODO: nature function needs improvement (Older articles, mostly).

# == DOI search method registry ==
#
# ORDER MATTERS. Do not reshuffle the entries unless you know exactly what the
# consequences are: the methods are tried from top to bottom.
#
# A trailing comment on an entry names the "expensive" operations it performs:
#   * "scrape": loads the article page and reads text off it (usually to get
#     the DOI)
#   * "DxDOI": loads the DOI in dx.doi.org to verify that it is a real DOI
#
# Some URLs can be reversed to a DOI by two or three different methods. The
# aim is to use the least expensive method that gets the job done while still
# avoiding false positives.
DOI_METHODS = [
    get_elifesciences_doi_from_link,
    get_plos_doi_from_link,
    get_early_release_doi_from_link,
    get_cell_doi_from_link,
    get_jci_doi_from_link,
    get_jstage_doi_from_link,           # uses scrape (1st)
    get_pnas_doi_from_link,
    get_bmj_doi_from_link,
    get_ahajournals_doi_from_link,      # uses scrape (2nd)
    get_biomedcentral_doi_from_link,
    get_nature_doi_from_link,
    get_sciencedirect_doi_from_link,    # uses scrape (last) and DxDOI
    get_karger_doi_from_link,
    get_spandidos_doi_from_link,        # uses scrape (1st)
    get_generic_doi_from_link,          # uses DxDOI
]
def try_doi_methods(url):
    """Run ``url`` through the registered DOI methods until one yields a DOI.

    Every function in DOI_METHODS is called with ``url``, in list order. The
    first truthy result wins and no further methods are tried.

    :param url: (str)
    :return: {'doi': <doi>, 'method': <method>} where <method> is the function
        object that produced the DOI, or None if no method returned one
    """
    for candidate in DOI_METHODS:
        found = candidate(url)
        if not found:
            continue
        return {'doi': found, 'method': candidate}
    return None
def try_vip_methods(url):
    """Parse a "volume-issue-page" style URL into its components.

    The URL is matched against ``re_vip``. On success the named groups of the
    match (hostname, volume, issue, first_page) are returned in a dictionary,
    together with two extra keys:

      * 'format': always the string 'vip'
      * 'jtitle': the result of get_journal_name_from_url(url), which is None
        when the URL's hostname is not in the hostname-to-journal map (see the
        ``hostname2jrnl`` module of this package; contributions to that map
        are welcome)

    :param url: (str)
    :return: dict or None (None when the URL does not match the VIP layout)
    """
    found = re_vip.match(url)
    if not found:
        return None

    journal = get_journal_name_from_url(url)
    result = found.groupdict()
    result['format'] = 'vip'
    result['jtitle'] = journal
    return result
def try_pmid_methods(url):
    """Pull a PMID straight out of ``url`` when the URL contains one.

    Two layouts are recognised, tried in this order: a "pmid=" query-string
    parameter, then an ncbi.nlm.nih.gov/pubmed/<pmid> path.

    Examples:
        http://aac.asm.org/cgi/pmidlookup?view=long&pmid=7689822 --> 7689822
        https://www.ncbi.nlm.nih.gov/pubmed/22253870 --> 22253870

    :param url: (str)
    :return: pmid (str of digits) or None
    """
    for pattern in (re_pmidlookup, re_pubmed_pmid):
        hit = pattern.match(url)
        if hit:
            return hit.group('pmid')
    return None