Source code for metapub.crossref

# -*- coding: utf-8 -*-

import datetime 
import logging

from habanero import Crossref as CREngine
import Levenshtein

from .base import Borg
from .config import DEFAULT_EMAIL
from . import cite
from .ncbi_errors import NCBIServiceError

log = logging.getLogger('metapub.crossref')

# Thresholds for the Levenshtein similarity ratio (0.0 - 1.0) used to compare titles in CR results.
TITLE_SIMILARITY_IDEAL_SCORE = .95   # automatically accept results over .95 threshold.
TITLE_SIMILARITY_MIN_SCORE = .8     # throw out results that fall below .8 threshold.


def get_most_similar_work_from_crossref_results(qstring, qname, cr_results):
    """Rank CrossRef results by Levenshtein similarity to the query string.

    The ``qname`` field of every result item (e.g. ``"title"``) is compared
    case-insensitively against ``qstring`` with ``Levenshtein.ratio``.  The
    item with the highest ratio is returned; on a tie the earliest item wins.

    Items that do not carry the ``qname`` field (or carry it empty) are
    skipped instead of raising, since CrossRef records are not guaranteed to
    include every field.

    :param qstring: (str) original query string for search
    :param qname: (str) name of query item (e.g. "title")
    :param cr_results: (dict) crossref results as returned by habanero
    :return: {qname + '_ld': <score>, 'work': <raw result item dict or None>}
    """
    score_key = qname + '_ld'
    bestcandidate = {score_key: 0, 'work': None}

    message = cr_results['message']
    if message['total-results'] <= 0:
        return bestcandidate

    # Lowercase the query once instead of once per candidate.
    query = qstring.lower()

    for item in message.get('items') or []:
        value = item.get(qname)
        # CrossRef wraps fields such as "title" in a list; plain string
        # fields are compared as a whole rather than by their first character.
        if isinstance(value, (list, tuple)):
            value = value[0] if value else None
        if not isinstance(value, str) or not value:
            continue

        this_ld = Levenshtein.ratio(query, value.lower())
        if this_ld > bestcandidate[score_key]:
            bestcandidate = {score_key: this_ld, 'work': item}

    return bestcandidate
class CrossRefWork(object):
    """Represents one 'work' from CrossRef search results.

    Instances are built from the keyword form of a CrossRef work record
    (``CrossRefWork(**item)``).  Hyphenated CrossRef keys are stored under
    underscore attribute names (e.g. ``container-title`` becomes
    ``container_title``); keys absent from the record are stored as None.
    """

    # (attribute name, CrossRef key) pairs that to_dict() maps back to the
    # hyphenated spelling used by CrossRef.
    _DICT_KEY_RENAMES = (
        ('references_count', 'references-count'),
        ('reference_count', 'reference-count'),
        ('issn_type', 'issn-type'),
        ('journal_issue', 'journal-issue'),
        ('container_title', 'container-title'),
        ('published_print', 'published-print'),
        ('is_referenced_by_count', 'is-referenced-by-count'),
        ('published_online', 'published-online'),
        ('content_domain', 'content-domain'),
    )

    def __init__(self, **kwargs):
        self.doi = kwargs.get('DOI', None)
        self.url = kwargs.get('URL', None)
        self.author = kwargs.get('author', None)
        self.indexed = kwargs.get('indexed', None)
        self.reference_count = kwargs.get('reference-count', None)
        self.publisher = kwargs.get('publisher', None)
        self.issue = kwargs.get('issue', None)
        self.funder = kwargs.get('funder', None)
        self.content_domain = kwargs.get('content-domain', None)
        self.published_print = kwargs.get('published-print', None)
        self.type = kwargs.get('type', None)
        self.created = kwargs.get('created', None)
        self.page = kwargs.get('page', None)
        self.source = kwargs.get('source', None)
        self.is_referenced_by_count = kwargs.get('is-referenced-by-count', None)
        self.title = kwargs.get('title', None)
        self.prefix = kwargs.get('prefix', None)
        self.volume = kwargs.get('volume', None)
        self.member = kwargs.get('member', None)
        self.published_online = kwargs.get('published-online', None)
        self.reference = kwargs.get('reference', None)
        self.container_title = kwargs.get('container-title', None)
        self.language = kwargs.get('language', None)
        self.link = kwargs.get('link', None)
        self.deposited = kwargs.get('deposited', None)
        self.score = kwargs.get('score', None)
        self.editor = kwargs.get('editor', None)
        self.issued = kwargs.get('issued', None)
        self.references_count = kwargs.get('references-count', None)
        self.journal_issue = kwargs.get('journal-issue', None)
        self.relation = kwargs.get('relation', None)
        self.ISSN = kwargs.get('ISSN', None)
        self.issn_type = kwargs.get('issn-type', None)

    def _issued_date_parts(self):
        """Return the leading non-null entries of issued['date-parts'][0] as a list.

        The list holds between zero and three items (year, month, day); it is
        empty when ``issued`` is missing or its first part is null.
        """
        if not self.issued:
            return []
        date_parts = self.issued.get('date-parts') or [[]]
        parts = []
        for part in date_parts[0] or []:
            if part is None:
                break
            parts.append(part)
        return parts

    @property
    def first_page(self):
        """Returns first page (number) of article as string, or None if self.page is empty."""
        if self.page:
            return self.page.split('-')[0]
        return None

    @property
    def citation(self):
        """Returns a formal citation string for this work."""
        return cite.article(**self.to_citation())

    @property
    def pubyear(self):
        """Returns the issued year, or '' when no issue date is available."""
        parts = self._issued_date_parts()
        return parts[0] if parts else ''

    @property
    def pubmonth(self):
        """Returns the issued month, or None when the issue date has no month."""
        parts = self._issued_date_parts()
        return parts[1] if len(parts) > 1 else None

    @property
    def pubdate(self):
        """Returns the issue date as a datetime.date, or None when unknown.

        A missing month or day defaults to 1, because datetime.date needs all
        three components while 'date-parts' may carry only a year or only a
        year and month.
        """
        parts = self._issued_date_parts()
        if not parts:
            return None
        year, month, day = (parts + [1, 1])[:3]
        return datetime.date(year, month, day)

    @staticmethod
    def _format_author_first_last(auth):
        """Format an author dict as 'Firstname Lastname', handling missing fields."""
        given = auth.get('given', '')
        family = auth.get('family', '')
        if given and family:
            return given + ' ' + family
        if family:
            return family
        # Authors without a family name fall back to their 'name' field.
        return auth.get('name', '')

    @staticmethod
    def _format_author_last_fm(auth):
        """Format an author dict as 'Lastname F', handling missing fields."""
        given = auth.get('given', '')
        family = auth.get('family', '')
        if family and given:
            return family + ' ' + given[0].upper()
        if family:
            return family
        # Authors without a family name fall back to their 'name' field.
        return auth.get('name', '')

    def _first_author(self):
        """Return the author dict whose 'sequence' is 'first', or None."""
        for auth in self.author or []:
            if auth.get('sequence') == 'first':
                return auth
        return None

    @property
    def author1(self):
        """Returns the first author as 'Firstname Lastname', or '' if there is none."""
        first = self._first_author()
        return self._format_author_first_last(first) if first is not None else ''

    @property
    def author1_last_fm(self):
        """Returns the first author as 'Lastname F', or '' if there is none."""
        first = self._first_author()
        return self._format_author_last_fm(first) if first is not None else ''

    @property
    def authors_str_lastfirst(self):
        """Returns this work's authors as a semicolon-separated string -- LASTNAME FIRSTInitial."""
        if not self.author:
            return ''
        # The leading entry is the author flagged sequence == 'first'; the
        # remaining entries follow the order given by CrossRef.
        names = [self.author1_last_fm]
        names.extend(self._format_author_last_fm(auth) for auth in self.author[1:])
        return ';'.join(names)

    @property
    def author_list(self):
        """Returns this work's authors as a flat list (Firstname Lastname), retaining order given by Crossref."""
        if not self.author:
            return []
        return [self._format_author_first_last(auth) for auth in self.author]

    @property
    def author_list_last_fm(self):
        """Returns this work's authors as a flat list (Lastname FirstInitial), retaining order given by Crossref."""
        if not self.author:
            return []
        return [self._format_author_last_fm(auth) for auth in self.author]

    def to_citation(self):
        """Describes this work as a dictionary suitable for citation lookups in PubMed."""
        # 'aulast' is the final whitespace-separated token of the first author's name.
        name_tokens = self.author1.split()
        aulast = name_tokens[-1] if name_tokens else ''
        return {
            'journal': self.container_title[0] if self.container_title else '',
            'year': self.pubyear,
            'title': self.title[0] if self.title else '',
            'authors': self.author_list_last_fm,
            'doi': self.doi,
            'volume': self.volume,
            'issue': self.issue,
            'pages': self.page,
            'first_page': self.first_page,
            'aulast': aulast,
        }

    def to_dict(self):
        """Describes this Work as a dictionary similar to the one returned by CrossRef.

        Attributes stored under underscore names are emitted under their
        hyphenated CrossRef keys; every other attribute keeps its attribute
        name as the key.
        """
        outd = self.__dict__.copy()
        for attr, key in self._DICT_KEY_RENAMES:
            if attr in outd:
                outd[key] = outd.pop(attr)
        return outd

    def __str__(self):
        template = ('<CrossRefWork {doi} Score: {score}> {aulast}. "{title}" {journal}. '
                    '{year}. {volume}({issue}):{pages}\n\t')
        return template.format(score=self.score, **self.to_citation())

    def __repr__(self):
        return "<CrossRefWork DOI: {doi}>".format(doi=self.doi)
class CrossRefFetcher(Borg):
    """Looks up CrossRef works through habanero's Crossref client.

    Valid field queries for this route are: affiliation, degree,
    event-acronym, bibliographic, container-title, publisher-name, author,
    event-theme, standards-body-acronym, chair, event-location, translator,
    funder-name, event-name, publisher-location, title, standards-body-name,
    contributor, editor, event-sponsor
    """

    def __init__(self, **kwargs):
        """Create the habanero client.

        :param email: (str) contact address passed to habanero as ``mailto``
            [default: DEFAULT_EMAIL from the package config]
        """
        # NOTE(review): Borg.__init__ is not invoked here; confirm against
        # .base whether state sharing depends on calling it.
        self.cr = CREngine(mailto=kwargs.get('email', DEFAULT_EMAIL))
        self.log = logging.getLogger('metapub.crossref.CrossRefFetcher')

    def article_by_doi(self, doi):
        """Returns a CrossRefWork object loaded by querying the Crossref works/DOI REST endpoint.

        :param doi: (str)
        :rtype: CrossRefWork
        :raises: HTTPError (404) if DOI not found.
        :raises: Exception for network/service issues
        """
        try:
            res = self.cr.works(doi)
            return CrossRefWork(**res['message'])
        except Exception as e:
            # Add context for CrossRef errors but don't use NCBI-specific handling.
            # Errors are classified by keywords found in their message text.
            if any(keyword in str(e).lower() for keyword in [
                'connection', 'timeout', 'network', 'service unavailable'
            ]):
                raise Exception(
                    f"Unable to fetch CrossRef data for DOI '{doi}' due to network/service issues. "
                    f"Check your internet connection and try again. Original error: {str(e)}"
                ) from e
            else:
                # Anything else (e.g. a 404 for an unknown DOI) propagates unchanged.
                raise

    def article_by_pma(self, pma, ideal_ld=TITLE_SIMILARITY_IDEAL_SCORE,
                       min_ld=TITLE_SIMILARITY_MIN_SCORE):
        """From a PubMedArticle object, use as much info as needed to get as precise
        a match on CrossRef as is possible.

        1st attempt: Title + Journal. Runs Levenshtein similarity on results; if the
        best result has a better similarity ratio than ideal_ld, it is returned
        immediately. Otherwise the best result is kept and compared against the
        2nd attempt results.

        2nd attempt: Title + First Author. Same process as 1st attempt but with the
        candidate found in the 1st attempt submitted for comparison.

        Finally: Return None or CrossRefWork from best candidate that exceeds
        min_ld requirement.

        :param pma: PubMedArticle object
        :param ideal_ld: (float) [default: set in global at top of crossref.py]
        :param min_ld: (float) [default: set in global at top of crossref.py]
        :rtype: CrossRefWork or None
        """
        # Try with Title and Journal only
        res = self.cr.works(query_bibliographic=pma.title,
                            query_container_title=pma.journal, limit=5)
        self.log.debug('PMID %s: Crossref Title/Journal query got %i results',
                       pma.pmid, res['message']['total-results'])
        bestcandidate = get_most_similar_work_from_crossref_results(pma.title, 'title', res)

        # if we have a real winner (exceeds ideal Lev. ratio), let's just take this one.
        if bestcandidate['title_ld'] > ideal_ld:
            self.log.debug('PMID %s: Best candidate had Levanshtein title similarity %f',
                           pma.pmid, bestcandidate['title_ld'])
            return CrossRefWork(**bestcandidate['work'])

        # If our only candidate is an empty work, we're just hosed, so cut it here.
        if bestcandidate['work'] is None:
            self.log.debug('PMID %s: No results found in CrossRef.', pma.pmid)
            return None

        # Insufficient results, try different combo of details.
        # Run our last candidate (if we got one) in the next attempt.
        self.log.debug('PMID %s: OK candidate with title_ld %f (title: %s)',
                       pma.pmid, bestcandidate['title_ld'], bestcandidate['work']['title'][0])

        # Try with Title and Author
        res = self.cr.works(query_bibliographic=pma.title,
                            query_author=pma.author1_lastfm, limit=5)
        self.log.debug('PMID %s: Crossref Title/Author query got %i results',
                       pma.pmid, res['message']['total-results'])

        if res['message']['total-results'] > 0:
            thiscandidate = get_most_similar_work_from_crossref_results(pma.title, 'title', res)
            if thiscandidate['title_ld'] > bestcandidate['title_ld']:
                # Log the candidate that is taking over, not the one being replaced.
                self.log.debug('PMID %s: Better candidate with title_ld %f (title: %s)',
                               pma.pmid, thiscandidate['title_ld'],
                               thiscandidate['work']['title'][0])
                bestcandidate = thiscandidate

        if bestcandidate['title_ld'] > min_ld:
            self.log.debug('PMID %s: Best candidate with title_ld %f > %f (min_ld) (title: %s)',
                           pma.pmid, bestcandidate['title_ld'], min_ld,
                           bestcandidate['work']['title'][0])
            return CrossRefWork(**bestcandidate['work'])

        self.log.debug('PMID %s: No suitable CrossRefWork found.', pma.pmid)
        return None

    def article_by_title(self, title, **kwargs):
        """Use CrossRef to find a work by its title. Returns first item in the list.

        Keywords are passed unmodified to crossref.works() [habanero]; ``limit``
        defaults to 1 when not supplied, and ``title`` is always sent as
        ``query_bibliographic``.

        :param title: str
        :rtype: CrossRefWork or None (if no results)
        """
        params = dict(kwargs)
        params.setdefault('limit', 1)
        params['query_bibliographic'] = title

        res = self.cr.works(**params)
        # Guard against a positive total with an empty page of items.
        items = res['message'].get('items') or []
        if res['message']['total-results'] > 0 and items:
            return CrossRefWork(**items[0])
        return None