# -*- coding: utf-8 -*-
import datetime
import logging
from habanero import Crossref as CREngine
import Levenshtein
from .base import Borg
from .config import DEFAULT_EMAIL
from . import cite
from .ncbi_errors import NCBIServiceError
# Module-level logger for CrossRef lookups.
log = logging.getLogger('metapub.crossref')
# Thresholds for the Levenshtein similarity ratio computed between a query title and
# the titles in CrossRef results (higher = more similar). They are the default values
# of the `ideal_ld` / `min_ld` parameters of CrossRefFetcher.article_by_pma.
TITLE_SIMILARITY_IDEAL_SCORE = .95 # a candidate scoring above this is accepted immediately.
TITLE_SIMILARITY_MIN_SCORE = .8 # a candidate must score above this to be returned at all.
# (stray "[docs]" Sphinx viewcode link marker removed here -- it was not part of the source)
def get_most_similar_work_from_crossref_results(qstring, qname, cr_results):
    """Rank CrossRef results by Levenshtein similarity on one field; return the best.

    Each result item's `qname` value (the first element when the value is a list,
    as CrossRef returns for "title") is compared case-insensitively against
    `qstring` using Levenshtein.ratio. Items that lack the field, or whose value
    is empty, are skipped rather than raising.

    :param qstring: (str) original query string for search
    :param qname: (str) name of query item (e.g. "title")
    :param cr_results: (dict) crossref results as returned by habanero
    :return: {'<qname>_ld': <best ratio, or 0 if nothing matched>,
              'work': <raw result item (dict) or None>}
    """
    score_key = qname + '_ld'
    bestcandidate = {score_key: 0, 'work': None}

    message = cr_results['message']
    if message['total-results'] > 0:
        # Lower-case the query once instead of on every iteration.
        wanted = qstring.lower()
        for item in message.get('items') or []:
            value = item.get(qname)
            # List-valued fields (e.g. title) are compared on their first entry;
            # plain-string fields are compared whole.
            candidate = value if isinstance(value, str) else (value[0] if value else None)
            if not candidate:
                continue
            this_ld = Levenshtein.ratio(wanted, candidate.lower())
            if this_ld > bestcandidate[score_key]:
                bestcandidate = {score_key: this_ld, 'work': item}
    return bestcandidate
# (stray "[docs]" Sphinx viewcode link marker removed here -- it was not part of the source)
class CrossRefWork(object):
    """Represents one 'work' from CrossRef search results.

    Constructed from the keyword form of a CrossRef work record, e.g.
    ``CrossRefWork(**item)``. Hyphenated CrossRef keys are stored on
    underscore-named attributes ('container-title' -> ``container_title``);
    'DOI' and 'URL' are stored as ``doi`` and ``url``. Missing keys become None.
    """

    def __init__(self, **kwargs):
        """Copy the known CrossRef work fields from kwargs onto attributes."""
        self.doi = kwargs.get('DOI', None)
        self.url = kwargs.get('URL', None)
        self.author = kwargs.get('author', None)
        self.indexed = kwargs.get('indexed', None)
        self.reference_count = kwargs.get('reference-count', None)
        self.publisher = kwargs.get('publisher', None)
        self.issue = kwargs.get('issue', None)
        self.funder = kwargs.get('funder', None)
        self.content_domain = kwargs.get('content-domain', None)
        self.published_print = kwargs.get('published-print', None)
        self.type = kwargs.get('type', None)
        self.created = kwargs.get('created', None)
        self.page = kwargs.get('page', None)
        self.source = kwargs.get('source', None)
        self.is_referenced_by_count = kwargs.get('is-referenced-by-count', None)
        self.title = kwargs.get('title', None)
        self.prefix = kwargs.get('prefix', None)
        self.volume = kwargs.get('volume', None)
        self.member = kwargs.get('member', None)
        self.published_online = kwargs.get('published-online', None)
        self.reference = kwargs.get('reference', None)
        self.container_title = kwargs.get('container-title', None)
        self.language = kwargs.get('language', None)
        self.link = kwargs.get('link', None)
        self.deposited = kwargs.get('deposited', None)
        self.score = kwargs.get('score', None)
        self.editor = kwargs.get('editor', None)
        self.issued = kwargs.get('issued', None)
        self.references_count = kwargs.get('references-count', None)
        self.journal_issue = kwargs.get('journal-issue', None)
        self.relation = kwargs.get('relation', None)
        self.ISSN = kwargs.get('ISSN', None)
        self.issn_type = kwargs.get('issn-type', None)

    @property
    def first_page(self):
        """Returns first page (number) of article as string, or None if self.page is empty."""
        if self.page:
            return self.page.split('-')[0]
        return None

    @property
    def citation(self):
        """Returns a formal citation string for this work."""
        return cite.article(**self.to_citation())

    def _issued_date_parts(self):
        """Return the first entry of issued['date-parts'] as a list ([] when absent)."""
        if not self.issued:
            return []
        date_parts = self.issued.get('date-parts') or [[]]
        return list(date_parts[0] or [])

    @property
    def pubyear(self):
        """Returns the issued year, or '' when no issued date parts are available."""
        parts = self._issued_date_parts()
        return parts[0] if parts else ''

    @property
    def pubmonth(self):
        """Returns the issued month, or None when the issued date has no month part."""
        parts = self._issued_date_parts()
        # Year-only dates have a single part; do not index past it.
        return parts[1] if len(parts) > 1 else None

    @property
    def pubdate(self):
        """Returns the issued date as a datetime.date, or None if it cannot be built.

        Missing month and/or day parts default to 1.
        """
        parts = self._issued_date_parts()
        if not parts or parts[0] is None:
            return None
        year, month, day = (parts + [1, 1])[:3]
        try:
            return datetime.date(int(year), int(month), int(day))
        except (TypeError, ValueError):
            # Non-numeric or out-of-range parts: there is no valid date to report.
            return None

    @staticmethod
    def _format_author_first_last(auth):
        """Format an author dict as 'Firstname Lastname', handling missing fields."""
        given = auth.get('given', '')
        family = auth.get('family', '')
        name = auth.get('name', '')
        if given and family:
            return given + ' ' + family
        if family:
            return family
        return name

    @staticmethod
    def _format_author_last_fm(auth):
        """Format an author dict as 'Lastname F', handling missing fields."""
        given = auth.get('given', '')
        family = auth.get('family', '')
        name = auth.get('name', '')
        if family and given:
            return family + ' ' + given[0].upper()
        if family:
            return family
        return name

    def _first_author(self):
        """Return the author dict whose 'sequence' is 'first', or None if there is none."""
        for auth in self.author or []:
            if auth.get('sequence') == 'first':
                return auth
        return None

    @property
    def author1(self):
        """Returns the first author as 'Firstname Lastname', or '' if none is marked first."""
        first = self._first_author()
        return self._format_author_first_last(first) if first else ''

    @property
    def author1_last_fm(self):
        """Returns the first author as 'Lastname F', or '' if none is marked first."""
        first = self._first_author()
        return self._format_author_last_fm(first) if first else ''

    @property
    def authors_str_lastfirst(self):
        """Returns this work's authors as a semicolon-separated string -- LASTNAME FIRSTInitial."""
        # Join every author in the order given, so the result does not depend on
        # the 'first'-sequence author sitting at index 0.
        return ';'.join(self.author_list_last_fm)

    @property
    def author_list(self):
        """Returns this work's authors as a flat list (Firstname Lastname), retaining order given by Crossref."""
        if not self.author:
            return []
        return [self._format_author_first_last(auth) for auth in self.author]

    @property
    def author_list_last_fm(self):
        """Returns this work's authors as a flat list (Lastname FirstInitial), retaining order given by Crossref."""
        if not self.author:
            return []
        return [self._format_author_last_fm(auth) for auth in self.author]

    def to_citation(self):
        """Describes this work as a dictionary suitable for citation lookups in PubMed.

        Keys: journal, year, title, authors, doi, volume, issue, pages, first_page, aulast.
        """
        author1 = self.author1
        # aulast is the last whitespace-separated token of the first author's name.
        aulast = author1.split()[-1] if author1 else ''
        return {'journal': self.container_title[0] if self.container_title else '',
                'year': self.pubyear,
                'title': self.title[0] if self.title else '',
                'authors': self.author_list_last_fm,
                'doi': self.doi,
                'volume': self.volume,
                'issue': self.issue,
                'pages': self.page,
                'first_page': self.first_page,
                'aulast': aulast,
                }

    def to_dict(self):
        """Describes this Work as a dictionary similar to the one returned by CrossRef.

        Starts from a copy of the instance attributes and renames the listed
        underscore attributes back to CrossRef's hyphenated keys. Attributes not
        listed below (e.g. ``doi``, ``url``, ``reference_count``) keep their
        attribute names.
        """
        outd = self.__dict__.copy()
        outd['references-count'] = outd.pop('references_count')
        outd['issn-type'] = outd.pop('issn_type')
        outd['journal-issue'] = outd.pop('journal_issue')
        outd['container-title'] = outd.pop('container_title')
        outd['published-print'] = outd.pop('published_print')
        # CrossRef's key is fully hyphenated; it was previously emitted as
        # 'is_referenced-by-count'.
        outd['is-referenced-by-count'] = outd.pop('is_referenced_by_count')
        outd['published-online'] = outd.pop('published_online')
        outd['content-domain'] = outd.pop('content_domain')
        return outd

    def __str__(self):
        return """<CrossRefWork {doi} Score: {score}> {aulast}. "{title}" {journal}. {year}. {volume}({issue}):{pages}\n\t""".format(score=self.score, **self.to_citation())

    def __repr__(self):
        return "<CrossRefWork DOI: {doi}>".format(doi=self.doi)
# (stray "[docs]" Sphinx viewcode link marker removed here -- it was not part of the source)
class CrossRefFetcher(Borg):
    """Looks up works on CrossRef through a habanero Crossref client.

    Valid field queries for this route are: affiliation, degree, event-acronym, bibliographic, container-title, publisher-name, author, event-theme, standards-body-acronym, chair, event-location, translator, funder-name, event-name, publisher-location, title, standards-body-name, contributor, editor, event-sponsor
    """

    def __init__(self, **kwargs):
        """Create the habanero client.

        :param email: (str) passed to habanero as `mailto` [default: DEFAULT_EMAIL]
        """
        self.cr = CREngine(mailto=kwargs.get('email', DEFAULT_EMAIL))
        self.log = logging.getLogger('metapub.crossref.CrossRefFetcher')

    def article_by_doi(self, doi):
        """Returns a CrossRefWork object loaded by querying the Crossref works/DOI REST endpoint.

        :param doi: (str)
        :rtype: CrossRefWork
        :raises: HTTPError (404) if DOI not found.
        :raises: Exception for network/service issues
        """
        try:
            res = self.cr.works(doi)
        except Exception as e:
            # Add context for CrossRef errors but don't use NCBI-specific handling.
            # Errors that do not look network-related are re-raised untouched.
            if any(keyword in str(e).lower() for keyword in [
                'connection', 'timeout', 'network', 'service unavailable'
            ]):
                raise Exception(
                    f"Unable to fetch CrossRef data for DOI '{doi}' due to network/service issues. "
                    f"Check your internet connection and try again. Original error: {str(e)}"
                ) from e
            raise
        return CrossRefWork(**res['message'])

    def article_by_pma(self, pma, ideal_ld=TITLE_SIMILARITY_IDEAL_SCORE,
                       min_ld=TITLE_SIMILARITY_MIN_SCORE):
        """From a PubMedArticle object, use as much info as needed to get as precise
        a match on CrossRef as is possible.

        1st attempt: Title + Journal. Runs Levenshtein similarity on results; if the best
                     result has a better similarity ratio than ideal_ld, it is returned
                     immediately. Otherwise the best result is kept and compared against
                     2nd attempt results.

        2nd attempt: Title + First Author. Same process as 1st attempt but with the candidate
                     found in 1st attempt submitted for comparison.

        Finally: Return None or CrossRefWork from best candidate that exceeds min_ld requirement.

        :param pma: PubMedArticle object (reads .title, .journal, .pmid, .author1_lastfm)
        :param ideal_ld: (float) [default: set in global at top of crossref.py]
        :param min_ld: (float) [default: set in global at top of crossref.py]
        :rtype: CrossRefWork or None
        """
        # Without a title there is nothing to query on or to compare results against.
        if not pma.title:
            self.log.debug('PMID %s: No title available; cannot search CrossRef.', pma.pmid)
            return None

        # Try with Title and Journal only
        res = self.cr.works(query_bibliographic=pma.title, query_container_title=pma.journal, limit=5)
        self.log.debug('PMID %s: Crossref Title/Journal query got %i results', pma.pmid, res['message']['total-results'])
        bestcandidate = get_most_similar_work_from_crossref_results(pma.title, 'title', res)

        # if we have a real winner (exceeds ideal Lev. ratio), let's just take this one.
        if bestcandidate['title_ld'] > ideal_ld:
            self.log.debug('PMID %s: Best candidate had Levanshtein title similarity %f', pma.pmid, bestcandidate['title_ld'])
            return CrossRefWork(**bestcandidate['work'])

        # If our only candidate is an empty work, we're just hosed, so cut it here.
        if bestcandidate['work'] is None:
            self.log.debug('PMID %s: No results found in CrossRef.', pma.pmid)
            return None

        # Insufficient results, try different combo of details.
        # Carry our current candidate into the next attempt for comparison.
        self.log.debug('PMID %s: OK candidate with title_ld %f (title: %s)', pma.pmid, bestcandidate['title_ld'], bestcandidate['work']['title'][0])

        # Try with Title and Author
        res = self.cr.works(query_bibliographic=pma.title, query_author=pma.author1_lastfm, limit=5)
        self.log.debug('PMID %s: Crossref Title/Author query got %i results', pma.pmid, res['message']['total-results'])
        if res['message']['total-results'] > 0:
            thiscandidate = get_most_similar_work_from_crossref_results(pma.title, 'title', res)
            if thiscandidate['title_ld'] > bestcandidate['title_ld']:
                # Promote first, then log, so the message describes the new candidate.
                bestcandidate = thiscandidate
                self.log.debug('PMID %s: Better candidate with title_ld %f (title: %s)', pma.pmid, bestcandidate['title_ld'], bestcandidate['work']['title'][0])

        if bestcandidate['title_ld'] > min_ld:
            self.log.debug('PMID %s: Best candidate with title_ld %f > %f (min_ld) (title: %s)',
                           pma.pmid, bestcandidate['title_ld'], min_ld, bestcandidate['work']['title'][0])
            return CrossRefWork(**bestcandidate['work'])

        self.log.debug('PMID %s: No suitable CrossRefWork found.', pma.pmid)
        return None

    def article_by_title(self, title, **kwargs):
        """Use CrossRef to find a work by its title. Returns first item in the list.

        Keywords are passed unmodified to crossref.works() [habanero]; `limit`
        defaults to 1 when not supplied.

        :param title: str
        :rtype: CrossRefWork or None (if no results)
        """
        kwargs.setdefault('limit', 1)
        res = self.cr.works(query_bibliographic=title, **kwargs)
        items = res['message'].get('items') or []
        if res['message']['total-results'] > 0 and items:
            return CrossRefWork(**items[0])
        return None