"""metapub.pubmedarticle -- PubMedArticle class instantiated by supplying ncbi XML string."""
import time
from datetime import datetime
from collections import OrderedDict
from .base import MetaPubObject
from .exceptions import MetaPubError
from .text_mining import re_numbers
from .pubmedauthor import PubMedAuthor
from . import cite
[docs]
class PubMedArticle(MetaPubObject):
"""This PubMedArticle class receives an XML string as its required argument
and parses it into its constituent parts, exposing them as attributes.
Usage:
paper = PubMedArticle(xml_string)
To query services to return an article by pmid, use PubMedFetcher, which
returns PubMedArticle objects.
When xmlstr is parsed, the `pubmed_type` attribute will be set to one of 'article' or 'book',
depending on whether PubmedBookArticle or PubmedArticle headings are found in the supplied
xmlstr at instantiation.
Since this class needs to work seamlessly in production whether it's a book
or an article, the PubmedArticle attributes will always be available (set to None in many
cases for PubmedBookArticle, e.g. volume, issue, journal), but PubmedBookArticle
attributes will only be set when pubmed_type='book'.
PubMedBook special handling of certain attributes:
* abstract: a joined string from self.book_abstracts
* title: comes from ArticleTitle
Special attributes for PubmedBookArticle (pubmed_type='book'):
* book_accession_id (default: None) - string from IdType="bookaccession", e.g. "NBK1403"
* book_title (default: None) - string with name of book (as differentiated from ArticleTitle)
* book_publisher (default: None) - publisher name, taken from Book/Publisher/PublisherName
* book_sections (default: []) - dict with key->value pairs as section_name->SectionTitle
* book_contribution_date (default: None) - python datetime date
* book_date_revised (default: None) - python datetime date
* book_history (default: []) - dictionary with key->value pairs as PubStatus -> python datetime
* book_language (default: None) - string (e.g. "eng")
* book_editors (default: []) - list containing names from 'editors' AuthorList
* book_abstracts (default: []) - dict with key->value pairs as Label->AbstractText.text
* book_medium (default: None) - string (e.g. "Internet")
* book_synonyms (default: None) - list of disease synonyms (applicable to "gene" book)
* book_publication_status (default: None) - string (e.g. "ppublish")
"""
[docs]
def __init__(self, xmlstr, *args, **kwargs):
    """Initialize PubMedArticle from NCBI XML data.

    Args:
        xmlstr (str): XML string from NCBI containing PubmedArticle or
            PubmedBookArticle data.
        *args: Additional positional arguments passed to parent class.
        **kwargs: Additional keyword arguments passed to parent class.

    Note:
        The XML type is automatically detected to handle both regular articles
        and book chapters. The `pubmed_type` attribute will be set to 'article'
        or 'book' accordingly, and appropriate attributes will be populated.
        Input that is neither is treated as an article rooted at '.'.
    """
    self.pubmed_type = determine_pubmed_xml_type(xmlstr)
    if self.pubmed_type == 'book':
        self._root = 'BookDocument'
        root_tag = 'PubmedBookArticle'
    elif self.pubmed_type == 'article':
        self._root = 'MedlineCitation'
        root_tag = 'PubmedArticle'
    else:
        # assume we're here because of predownloaded Medline XML.
        self.pubmed_type = 'article'
        self._root = '.'
        root_tag = None
    # Unpack the extras so the parent receives them as real positional /
    # keyword arguments rather than as one tuple and one dict.
    super(PubMedArticle, self).__init__(xmlstr, root_tag, *args, **kwargs)

    is_book = self.pubmed_type == 'book'

    # shared between book and article types:
    self.pmid = self._get_pmid()
    self.url = self._get_url()
    self.authors = self._get_book_authors() if is_book else self._get_authors()
    self.author_list = self._get_book_author_list() if is_book else self._get_author_list()
    self.title = self._get_book_articletitle() if is_book else self._get_title()
    # The next three read self.authors / self.author1_last_fm, so order matters.
    self.authors_str = self._get_authors_str()
    self.author1_last_fm = self._get_author1_last_fm()
    self.author1_lastfm = self._get_author1_lastfm()
    self.keywords = self._get_keywords()

    # 'article' only (not shared). first_page/last_page read self.pages.
    self.pages = None if is_book else self._get_pages()
    self.first_page = None if is_book else self._get_first_page()
    self.last_page = None if is_book else self._get_last_page()
    self.volume = None if is_book else self._get_volume()
    self.issue = None if is_book else self._get_issue()
    self.volume_issue = None if is_book else self._get_volume_issue()
    self.doi = None if is_book else self._get_doi()
    self.pii = None if is_book else self._get_pii()
    self.pmc = None if is_book else self._get_pmc()
    self.issn = None if is_book else self._get_issn()

    # MeSH headings ('article' only)
    self.mesh = self._get_mesh_headings()
    # Chemical associations ('article' only)
    self.chemicals = self._get_chemicals()
    # Grant information (?? 'article' only ??)
    self.grants = self._get_grantlist()
    # Publication Types (?? 'article' only ??)
    self.publication_types = self._get_publication_types()

    # 'book' only:
    self.book_accession_id = self._get_bookaccession_id() if is_book else None
    self.book_title = self._get_book_title() if is_book else None
    self.book_publisher = self._get_book_publisher() if is_book else None
    self.book_language = self._get_book_language() if is_book else None
    self.book_editors = self._get_book_editors() if is_book else None
    self.book_abstracts = self._get_book_abstracts() if is_book else None
    self.book_sections = self._get_book_sections() if is_book else None
    self.book_copyright = self._get_book_copyright() if is_book else None
    self.book_medium = self._get_book_medium() if is_book else None
    self.book_synonyms = self._get_book_synonyms() if is_book else None
    self.book_publication_status = self._get_book_publication_status() if is_book else None
    self.book_history = self._get_book_history() if is_book else None
    self.book_contribution_date = self._get_book_contribution_date() if is_book else None
    # Previously this was (mistakenly) filled from the contribution date.
    # Only call the getter when the element exists, so a book without
    # DateRevised yields None instead of an error.
    if is_book and self.content.find('BookDocument/DateRevised') is not None:
        self.book_date_revised = self._get_book_date_revised()
    else:
        self.book_date_revised = None

    # the shared oddballs, must be done last (they read attributes set above).
    self.abstract = self._get_book_abstract() if is_book else self._get_abstract()
    self.journal = self.book_title if is_book else self._get_journal()
    self.year = self._get_book_year() if is_book else self._get_year()
    self.history = self._get_article_history()
[docs]
def to_dict(self):
    """Convert PubMedArticle to dictionary representation.

    Returns:
        Dict[str, Any]: A shallow copy of the instance attributes with the
        internal 'content', 'xml', and '_root' entries removed.

    Note:
        The instance itself is not modified; only the returned copy is
        stripped. Missing internal keys are ignored.
    """
    outd = self.__dict__.copy()
    for internal_key in ('content', 'xml', '_root'):
        outd.pop(internal_key, None)
    # Return the stripped copy (the original returned self.__dict__ itself).
    return outd
@property
def citation(self):
    """Formatted plain-text citation for this record.

    Records with a `book_accession_id` are rendered by `cite.book`; all
    others are rendered by `cite.article` from the `to_dict()` fields.

    Article Example:
        McNally EM, et al. Genetic mutations and mechanisms in dilated cardiomyopathy. Journal of Clinical Investigation. 2013; 123:19-26. doi: 10.1172/JCI62862.

    Book Example (GeneReviews):
        Tranebjarg L, et al. Jervell and Lange-Nielsen syndrome. 2002 Jul 29 (Updated 2014 Nov 20). In: Pagon RA, et al., editors. GeneReviews (Internet). Seattle (WA): University of Washington, Seattle; 1993-2015. Available from: https://www.ncbi.nlm.nih.gov/books/NBK1405/.
    """
    if not self.book_accession_id:
        fields = self.to_dict()
        return cite.article(**fields)
    # special handling for GeneReviews books
    return cite.book(self)
@property
def citation_html(self):
    """Formatted HTML citation for this record.

    Same dispatch as the plain-text citation, with `as_html=True` passed to
    `cite.book` / `cite.article`.

    Article Example:
        McNally EM, <i>et al</i>. Genetic mutations and mechanisms in dilated cardiomyopathy. <i>Journal of Clinical Investigation</i>. 2013; <b>123</b>:19-26. doi: 10.1172/JCI62862.

    GeneReviews Example:
        Tranebjarg L, <i>et al</i>. <i>Jervell and Lange-Nielsen syndrome</i>. 2002 Jul 29 (Updated 2014 Nov 20). In: Pagon RA, <i>et al</i>., editors. GeneReviews (Internet). Seattle (WA): University of Washington, Seattle; 1993-2015. Available from: https://www.ncbi.nlm.nih.gov/books/NBK1405/.
    """
    if not self.book_accession_id:
        fields = self.to_dict()
        return cite.article(as_html=True, **fields)
    # special handling for GeneReviews books
    return cite.book(self, as_html=True)
@property
def citation_bibtex(self):
    """Citation produced by `cite.bibtex` from this record's `to_dict()` fields.

    `isbook=True` is passed when `book_accession_id` is set.
    """
    fields = self.to_dict()
    if not self.book_accession_id:
        return cite.bibtex(**fields)
    return cite.bibtex(isbook=True, **fields)
@property
def pubdate(self):
    """Normalized publication date as a datetime object.

    Order of preference:
        1. For books: `book_contribution_date` (returned as-is).
        2. For articles: the JournalIssue PubDate element, parsed by
           `_construct_datetime`.
        3. The first non-empty entry of `self.history` among the statuses
           'pubmed', 'entrez', 'received', 'accepted' (in that order).

    Returns:
        datetime or None: the chosen date, or None if nothing usable exists.

    Example:
        article = fetch.article_by_pmid('12345')
        if article.pubdate:
            print(f"Published: {article.pubdate.strftime('%Y-%m-%d')}")
    """
    if self.pubmed_type == 'book':
        return self.book_contribution_date

    node = self.content.find(self._root + '/Article/Journal/JournalIssue/PubDate')
    if node is not None:
        parsed = self._construct_datetime(node)
        if parsed:
            return parsed

    # Nothing usable in PubDate: fall back to the history dates.
    if not self.history:
        return None
    for status in ('pubmed', 'entrez', 'received', 'accepted'):
        if status in self.history:
            when = self.history[status]
            if when:
                return when
    return None
def _construct_datetime(self, d):
    """Build a datetime from an XML date element.

    Reads the Year / Month / Day children of `d`. Missing or empty parts
    default to 1. If no Month could be parsed, a Season child (spring,
    summer, fall/autumn, winter) supplies an approximate month. When no
    Year/Month/Day child exists at all, a MedlineDate child is handed to
    `_parse_medlinedate`.

    Args:
        d: element exposing `find()`, or None.

    Returns:
        datetime or None: None when `d` is None, when the year is not
        numeric, when the assembled values do not form a valid date, or
        when no date information is present.
    """
    if d is None:
        return None

    # Fixed English abbreviations: avoids the locale dependence of
    # time.strptime('%b') and its ValueError on unknown names.
    month_abbrs = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec')

    # if any part is missing, it defaults to 1.
    parts = {'year': 1, 'month': 1, 'day': 1}
    found_structured_date = False
    month_found = False

    for name in ('Year', 'Month', 'Day'):
        elem = d.find(name)
        if elem is None:
            continue
        found_structured_date = True
        item = elem.text
        if item is None:
            continue
        key = name.lower()
        try:
            parts[key] = int(item)
        except ValueError:
            if key == 'year':
                # fixes spurious crap seen at least once: "2007 (details online)" (pmid 19659763)
                try:
                    parts['year'] = int(item[:4])
                except ValueError:
                    # No usable year at all: a date cannot be built.
                    return None
            elif key == 'month':
                # months can look like "December", "Dec", "1"
                abbr = item.strip()[:3].lower()
                if abbr not in month_abbrs:
                    continue
                parts['month'] = month_abbrs.index(abbr) + 1
            else:
                # Unparseable day: keep the default.
                continue
        if key == 'month':
            month_found = True

    # Season only fills in when no Month was parsed (a real January stays).
    if found_structured_date and not month_found:
        season_elem = d.find('Season')
        if season_elem is not None and season_elem.text:
            season_to_month = {
                'spring': 3,   # March
                'summer': 6,   # June
                'fall': 9,     # September
                'autumn': 9,   # September
                'winter': 12,  # December
            }
            season_text = season_elem.text.strip().lower()
            if season_text in season_to_month:
                parts['month'] = season_to_month[season_text]

    if found_structured_date:
        try:
            return datetime(**parts)
        except ValueError:
            # one of the values was out of range, e.g. pmid 17924334
            # where the "accepted" year was "20007". at any rate, forget it.
            return None

    # If no structured date, try MedlineDate
    medline_elem = d.find('MedlineDate')
    if medline_elem is not None and medline_elem.text:
        return self._parse_medlinedate(medline_elem.text)

    return None
def _parse_medlinedate(self, medline_text):
    """Parse MedlineDate strings like '2007 Spring', '1999-2000', '2007 Mar-Apr'.

    The year is the first 4-digit number starting with 19 or 20. The month
    is the month or season name that appears FIRST in the text (seasons map
    to approximate months); it defaults to January. The day is the first
    standalone 1-2 digit number in the range 1..31; it defaults to 1.

    Args:
        medline_text (str): raw MedlineDate text.

    Returns:
        datetime or None: None when the text is empty or has no recognizable
        year. If the year/month/day combination is invalid, January 1st of
        the year is returned.
    """
    import re

    if not medline_text:
        return None
    text = medline_text.strip()

    year_match = re.search(r'\b(19|20)\d{2}\b', text)
    if not year_match:
        return None
    year = int(year_match.group())

    month_by_token = {
        'january': 1, 'jan': 1, 'february': 2, 'feb': 2, 'march': 3, 'mar': 3,
        'april': 4, 'apr': 4, 'may': 5, 'june': 6, 'jun': 6,
        'july': 7, 'jul': 7, 'august': 8, 'aug': 8, 'september': 9, 'sep': 9,
        'october': 10, 'oct': 10, 'november': 11, 'nov': 11,
        'december': 12, 'dec': 12,
        # Seasons (map to approximate months)
        'spring': 3, 'summer': 6, 'fall': 9, 'autumn': 9, 'winter': 12,
    }
    # One combined pattern so the earliest token in the text wins. Trying
    # the names in calendar order made "2007 Nov-2008 Feb" parse as February.
    token_re = r'\b(' + '|'.join(month_by_token) + r')\b'
    token_match = re.search(token_re, text, re.IGNORECASE)
    month = month_by_token[token_match.group(1).lower()] if token_match else 1

    day = 1
    day_match = re.search(r'\b(\d{1,2})\b', text)
    if day_match:
        potential_day = int(day_match.group())
        if 1 <= potential_day <= 31:
            day = potential_day

    try:
        return datetime(year=year, month=month, day=day)
    except ValueError:
        # Invalid date combination, fallback to year only
        return datetime(year=year, month=1, day=1)
def _get_bookaccession_id(self):
    """Return the text of the first ArticleId whose IdType is 'bookaccession', or None."""
    article_ids = self.content.findall('BookDocument/ArticleIdList/ArticleId')
    accession_texts = (
        node.text for node in article_ids if node.get('IdType') == 'bookaccession'
    )
    return next(accession_texts, None)
def _get_book_title(self):
    """Look up BookDocument/Book/BookTitle through the inherited `_get` helper."""
    book_title_path = 'BookDocument/Book/BookTitle'
    return self._get(book_title_path)
def _get_book_articletitle(self):
    """Look up BookDocument/ArticleTitle through the inherited `_get` helper."""
    article_title_path = 'BookDocument/ArticleTitle'
    return self._get(article_title_path)
def _get_book_authors(self):
    """Return the BookDocument/AuthorList authors as name strings.

    Each Author element is converted by `_xml_au_to_last_fm`.
    """
    names = []
    for author_node in self.content.findall('BookDocument/AuthorList/Author'):
        names.append(_xml_au_to_last_fm(author_node))
    return names
def _get_book_author_list(self):
    """Return one `PubMedAuthor` per BookDocument/AuthorList/Author element."""
    author_objects = []
    for author_node in self.content.findall('BookDocument/AuthorList/Author'):
        author_objects.append(PubMedAuthor(author_node))
    return author_objects
def _get_book_publisher(self):
    """Look up BookDocument/Book/Publisher/PublisherName through `_get`."""
    publisher_name_path = 'BookDocument/Book/Publisher/PublisherName'
    return self._get(publisher_name_path)
def _get_book_publisher_location(self):
    """Look up BookDocument/Book/Publisher/PublisherLocation through `_get`."""
    publisher_location_path = 'BookDocument/Book/Publisher/PublisherLocation'
    return self._get(publisher_location_path)
def _get_book_language(self):
    """Look up BookDocument/Language through the inherited `_get` helper."""
    language_path = 'BookDocument/Language'
    return self._get(language_path)
def _get_book_editors(self):
    """Return names from BookDocument/Book/AuthorList/Author.

    Each Author element is converted by `_xml_au_to_last_fm`.
    """
    editor_names = []
    for editor_node in self.content.findall('BookDocument/Book/AuthorList/Author'):
        editor_names.append(_xml_au_to_last_fm(editor_node))
    return editor_names
def _get_book_abstracts(self):
    """Map each AbstractText's Label attribute to its extracted text.

    Returns an OrderedDict in document order; text comes from the inherited
    `_extract_text` helper. A repeated Label keeps only the last section.
    """
    abstract_nodes = self.content.findall('BookDocument/Abstract/AbstractText')
    return OrderedDict(
        (node.get('Label'), self._extract_text(node)) for node in abstract_nodes
    )
def _get_book_sections(self):
    """Map each SectionTitle's `sec` attribute to its text.

    Reads BookDocument/Sections/Section. Sections without a SectionTitle
    child are skipped rather than raising AttributeError.

    Returns:
        dict: {sec attribute value: SectionTitle text}.
    """
    sections = {}
    for section in self.content.findall('BookDocument/Sections/Section'):
        sec_title = section.find('SectionTitle')
        if sec_title is None:
            continue
        sections[sec_title.get('sec')] = sec_title.text
    return sections
def _get_book_abstract(self):
    """Join `self.book_abstracts` into one newline-separated string.

    Labelled sections are rendered as "Label: text". Sections whose label
    is None or empty are rendered as the text alone (previously they came
    out as "None: text").
    """
    lines = []
    for label, text in self.book_abstracts.items():
        if label:
            lines.append('%s: %s' % (label, text))
        else:
            lines.append('%s' % (text,))
    return '\n'.join(lines)
def _get_book_copyright(self):
    """Look up BookDocument/Abstract/CopyrightInformation through `_get`."""
    copyright_path = 'BookDocument/Abstract/CopyrightInformation'
    return self._get(copyright_path)
def _get_book_medium(self):
    """Look up BookDocument/Book/Medium through the inherited `_get` helper."""
    medium_path = 'BookDocument/Book/Medium'
    return self._get(medium_path)
def _get_book_contribution_date(self):
    """Parse BookDocument/ContributionDate with `_construct_datetime`.

    Returns None when the element is absent.
    """
    node = self.content.find('BookDocument/ContributionDate')
    return None if node is None else self._construct_datetime(node)
def _get_book_date_revised(self):
    """Parse BookDocument/DateRevised with `_construct_datetime`.

    Returns None when the element is absent, so that `_construct_datetime`
    is never handed a missing element.
    """
    date_revised_element = self.content.find('BookDocument/DateRevised')
    if date_revised_element is None:
        return None
    return self._construct_datetime(date_revised_element)
def _get_book_synonyms(self):
    """Return Item texts when the first ItemList has ListType 'Synonyms'.

    The check looks only at the first BookDocument/ItemList; when it passes,
    the text of every BookDocument/ItemList/Item is returned. Otherwise an
    empty list is returned.
    """
    first_list = self.content.find('BookDocument/ItemList')
    if first_list is None or first_list.get('ListType') != 'Synonyms':
        return []
    synonyms = []
    for item in self.content.findall('BookDocument/ItemList/Item'):
        synonyms.append(item.text)
    return synonyms
def _get_book_history(self):
    """Map each PubMedPubDate's PubStatus to its parsed date.

    Reads PubmedBookData/History/PubMedPubDate; values come from
    `_construct_datetime`.
    """
    pubdate_nodes = self.content.findall('PubmedBookData/History/PubMedPubDate')
    return {
        node.get('PubStatus'): self._construct_datetime(node)
        for node in pubdate_nodes
    }
def _get_book_publication_status(self):
    """Look up PubmedBookData/PublicationStatus through the inherited `_get` helper."""
    status_path = 'PubmedBookData/PublicationStatus'
    return self._get(status_path)
def _get_book_year(self):
    """Return the year of `book_contribution_date`, or None when that is unset."""
    contributed = self.book_contribution_date
    return contributed.year if contributed else None
def _get_pmid(self):
    """Look up the PMID element directly under `self._root` through `_get`."""
    pmid_path = self._root + '/PMID'
    return self._get(pmid_path)
def _get_url(self):
    """Build the PubMed URL by appending `str(self.pmid)` to the base address."""
    base_and_id = ('https://ncbi.nlm.nih.gov/pubmed/', str(self.pmid))
    return ''.join(base_and_id)
def _get_abstract(self):
    """Return the article abstract as a single string.

    * No AbstractText element: whatever the inherited `_get` helper returns
      for that path.
    * One AbstractText: its extracted text.
    * Several AbstractText elements (structured abstract, see
      https://www.nlm.nih.gov/bsd/policy/structured_abstracts.html):
      one line per section, "Label: text" for labelled sections and the
      bare text for unlabelled ones.

    Sections are kept in document order and are never merged. The earlier
    dict-by-Label approach dropped sections that shared a label (or had
    none) and rendered missing labels as "None: ...".
    """
    path = self._root + '/Article/Abstract/AbstractText'
    abstracts = self.content.findall(path)
    if not abstracts:
        return self._get(path)
    if len(abstracts) == 1:
        return self._extract_text(abstracts[0])

    lines = []
    for section in abstracts:
        label = section.get('Label')
        text = self._extract_text(section)
        if label:
            lines.append('%s: %s' % (label, text))
        else:
            lines.append(text if text is not None else '')
    return '\n'.join(lines)
def _get_authors(self):
    """Return the article's authors as name strings (possibly an empty list).

    Each Article/AuthorList/Author element under `self._root` is converted
    by `_xml_au_to_last_fm`.
    """
    # N.B. Citations may have 0 authors. e.g., pmid:7550356
    names = []
    for author_node in self.content.findall(self._root + '/Article/AuthorList/Author'):
        names.append(_xml_au_to_last_fm(author_node))
    return names
def _get_author_list(self):
    """Return one `PubMedAuthor` per Article/AuthorList/Author element under `self._root`."""
    author_objects = []
    for author_node in self.content.findall(self._root + '/Article/AuthorList/Author'):
        author_objects.append(PubMedAuthor(author_node))
    return author_objects
def _get_authors_str(self):
    """Join `self.authors` into a single string separated by '; '."""
    separator = '; '
    return separator.join(self.authors)
def _get_author1_last_fm(self):
    """Return the first entry of `self.authors` (format "Last INITS"), or None if there are no authors."""
    return self.authors[0] if self.authors else None
def _get_author1_lastfm(self):
    """Return `author1_last_fm` with all spaces removed ("LastINITS"), or None if it is unset."""
    first_author = self.author1_last_fm
    if first_author is None:
        return None
    return first_author.replace(' ', '')
def _get_keywords(self):
    """Return the text of every KeywordList/Keyword element under `self._root`."""
    keywords = []
    for keyword_node in self.content.findall(self._root + '/KeywordList/Keyword'):
        keywords.append(keyword_node.text)
    return keywords
def _get_journal(self):
    """Return the journal's ISOAbbreviation, falling back to its Title.

    Both lookups go through the inherited `_get` helper; the Title is used
    only when the abbreviation lookup yields None.
    """
    journal_path = self._root + '/Article/Journal/'
    abbreviation = self._get(journal_path + 'ISOAbbreviation')
    if abbreviation is not None:
        return abbreviation
    # e.g., https://www.ncbi.nlm.nih.gov/pubmed?term=21242195
    return self._get(journal_path + 'Title')
def _get_pages(self):
    """Look up Article/Pagination/MedlinePgn under `self._root` through `_get`."""
    pagination_path = self._root + '/Article/Pagination/MedlinePgn'
    return self._get(pagination_path)
def _get_first_page(self):
    """Return the part of `self.pages` before the first '-'.

    If `self.pages` has no string methods (e.g. it is None), it is returned
    unchanged.
    """
    pages = self.pages
    try:
        first, _sep, _rest = pages.partition('-')
    except AttributeError:
        return pages
    return first
def _get_last_page(self):
    """Return the last page of `self.pages`, expanding abbreviated ranges.

    * "148-52" -> "152" and "291-4" -> "294": the last number is shorter
      than the first page, so the missing leading digits are copied from
      `self.first_page`.
    * "19-26" -> "26": already complete, returned as-is (this case used to
      fall through and return None).
    * Non-numeric page labels (e.g. "e123-e130") -> the text after the
      first '-' unchanged.
    * No '-' in pages, or pages is None -> None.
    """
    try:
        lastnum = self.pages.split('-')[1]
    except (IndexError, AttributeError):
        return None

    try:
        if int(lastnum) < int(self.first_page):
            return self.first_page[:-len(lastnum)] + lastnum
    except (ValueError, TypeError):
        # Page labels are not plain numbers; nothing to expand.
        pass
    return lastnum
def _get_title(self):
    """Look up Article/ArticleTitle under `self._root` through `_get`."""
    title_path = self._root + '/Article/ArticleTitle'
    return self._get(title_path)
def _get_volume(self):
    """Return the JournalIssue Volume text, or None when the element is missing."""
    volume_path = self._root + '/Article/Journal/JournalIssue/Volume'
    try:
        volume_node = self.content.find(volume_path)
        volume_text = volume_node.text
    except AttributeError:
        return None
    return volume_text
def _get_issue(self):
    """Return the JournalIssue Issue text, or None when the element is missing."""
    issue_path = self._root + '/Article/Journal/JournalIssue/Issue'
    try:
        issue_node = self.content.find(issue_path)
        issue_text = issue_node.text
    except AttributeError:
        return None
    return issue_text
def _get_volume_issue(self):
    """Return "volume(issue)", just the volume, or None.

    * Volume and Issue both present -> "Volume(Issue)".
    * Volume only -> the Volume text.
    * No Volume (or no JournalIssue at all) -> None.
    """
    journal_issue = self.content.find(self._root + '/Article/Journal/JournalIssue')
    if journal_issue is None:
        return None

    volume_node = journal_issue.find('Volume')
    if volume_node is None:
        # electronic pubs may not have volume or issue
        # e.g., https://www.ncbi.nlm.nih.gov/pubmed?term=20860988
        return None

    issue_node = journal_issue.find('Issue')
    if issue_node is None:
        return volume_node.text
    return '%s(%s)' % (volume_node.text, issue_node.text)
def _get_article_history(self):
    """Map each PubmedData/History child's PubStatus to its parsed date.

    Values come from `_construct_datetime`. Returns an empty dict when the
    History element is absent.
    """
    history = {}
    pubdates = self.content.find('PubmedData/History')
    if pubdates is not None:
        # Iterate the element directly: Element.getchildren() is deprecated
        # in lxml and was removed from xml.etree.ElementTree in Python 3.9.
        for pubdate in pubdates:
            history[pubdate.get('PubStatus')] = self._construct_datetime(pubdate)
    return history
def _get_year(self):
    """Return the publication year as found in the JournalIssue PubDate.

    Uses PubDate/Year when `_get` finds it; otherwise the first four
    characters of PubDate/MedlineDate. Returns None when neither is found.
    """
    pubdate_path = self._root + '/Article/Journal/JournalIssue/PubDate/'
    year = self._get(pubdate_path + 'Year')
    if year is not None:
        return year
    # case applicable for pmid:9887384 (at least)
    try:
        return self._get(pubdate_path + 'MedlineDate')[0:4]
    except TypeError:
        return None
def _get_doi(self):
    """Look up the ArticleId with IdType "doi" in PubmedData/ArticleIdList through `_get`."""
    doi_path = 'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]'
    return self._get(doi_path)
def _get_pii(self):
    """Look up the ArticleId with IdType "pii" in PubmedData/ArticleIdList through `_get`."""
    pii_path = 'PubmedData/ArticleIdList/ArticleId[@IdType="pii"]'
    return self._get(pii_path)
def _get_pmc(self):
    """Return the "pmc" ArticleId with its first three characters removed.

    The three dropped characters are presumably the "PMC" prefix. Returns
    None when `_get` yields a value that cannot be sliced (e.g. None).
    """
    pmc_path = 'PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]'
    try:
        raw_id = self._get(pmc_path)
        return raw_id[3:]
    except TypeError:
        return None
def _get_issn(self):
    """Look up Article/Journal/ISSN under `self._root` through `_get`."""
    issn_path = self._root + '/Article/Journal/ISSN'
    return self._get(issn_path)
def _get_mesh_headings(self):
    """Return the MeSH headings keyed by descriptor UI.

    Returns None for books. Otherwise a dict of the form::

        {descriptor_ui: {'descriptor_name': str,
                         'descriptor_major_topic': bool,
                         'qualifiers': [{'qualifier_name': str,
                                         'qualifier_ui': str,
                                         'qualifier_major_topic': bool}, ...]}}

    The lookup is relative to `self._root` (like the other article getters)
    so it also works when the document is rooted at '.'. Headings without
    a DescriptorName are skipped.
    """
    if self.pubmed_type == 'book':
        return None

    outd = {}
    for mesh in self.content.findall(self._root + '/MeshHeadingList/MeshHeading'):
        descript = mesh.find('DescriptorName')  # should always be present
        if descript is None:
            continue
        qualifiers_list = [
            {
                'qualifier_name': qual.text,
                'qualifier_ui': qual.get('UI'),
                'qualifier_major_topic': qual.get('MajorTopicYN') == 'Y',
            }
            for qual in mesh.findall('QualifierName')
        ]
        outd[descript.get('UI')] = {
            'descriptor_name': descript.text,
            'descriptor_major_topic': descript.get('MajorTopicYN') == 'Y',
            'qualifiers': qualifiers_list,
        }
    return outd
def _get_chemicals(self):
    """Return the chemical list keyed by substance UI.

    Returns None for books. Otherwise a dict of the form::

        {substance_ui: {'substance_name': str, 'registry_number': str or None}}

    The lookup is relative to `self._root` so it also works when the
    document is rooted at '.'. Chemical entries without NameOfSubstance are
    skipped; a missing RegistryNumber yields None instead of an error.
    """
    if self.pubmed_type == 'book':
        return None

    outd = {}
    for chem in self.content.findall(self._root + '/ChemicalList/Chemical'):
        substance = chem.find('NameOfSubstance')
        if substance is None:
            continue
        regnum_node = chem.find('RegistryNumber')
        # very often this is '0'
        regnum = regnum_node.text if regnum_node is not None else None
        outd[substance.get('UI')] = {
            'substance_name': substance.text,
            'registry_number': regnum,
        }
    return outd
def _get_publication_types(self):
    """Return {UI attribute: text} for each Article/PublicationTypeList/PublicationType.

    The lookup is relative to `self._root` (like the other article getters)
    so it also works when the document is rooted at '.'.
    """
    pubtypes = self.content.findall(
        self._root + '/Article/PublicationTypeList/PublicationType')
    return {pt.get('UI'): pt.text for pt in pubtypes}
def _get_grantlist(self):
    """Return one dict per Grant element found under `self._root`.

    Grants are looked up in `<root>/Article/GrantList/Grant` and in
    `<root>/GrantList/Grant`. Each dict holds the text of the Grant's child
    elements::

        {'agency': ..., 'country': ..., 'grant_id': ..., 'acronym': ...}

    A child that is absent yields None. The previous version iterated the
    GrantList containers and read Agency/Country as XML attributes, so it
    never returned real values.

    NOTE(review): element placement follows the NLM PubMed DTD (GrantList
    inside Article; Agency/Country/GrantID/Acronym as children of Grant) -
    confirm against current NLM documentation.
    """
    grant_paths = (
        self._root + '/Article/GrantList/Grant',
        self._root + '/GrantList/Grant',
    )
    outl = []
    for path in grant_paths:
        for grant in self.content.findall(path):
            outl.append({
                'agency': grant.findtext('Agency'),
                'country': grant.findtext('Country'),
                'grant_id': grant.findtext('GrantID'),
                'acronym': grant.findtext('Acronym'),
            })
    return outl
def __str__(self):
    """Return a one-line summary built from the `to_dict()` fields.

    Articles and books use different templates; the choice follows
    `self.pubmed_type`.
    """
    # [article example]
    # Asensio C, Pérez-Díaz JC. A new family of low molecular weight antibiotics from enterobacteria. Biochem Biophys Res Commun. 1976 Mar 8;69(1):7-14.
    if self.pubmed_type == 'article':
        template = '<PubMedArticle {pmid}> {authors_str}. {title}. {journal}. {year}. {volume_issue}:{pages}'
    else:
        template = '<PubMedBook {pmid}> {title}. {authors_str}. {book_title}. {year}'
    fields = self.to_dict()
    return template.format(**fields)
############################################################################
## Utilities
def _xml_au_to_last_fm(au):
    """Medline XML specific conversion of an Author element to a name string.

    Order of preference:
        1. "LastName Initials" when both have text.
        2. CollectiveName text.
        3. LastName alone.

    Elements that exist but are empty are treated as missing (previously an
    empty LastName or Initials raised an uncaught TypeError).

    Args:
        au: Author element exposing `findtext()`, or None.

    Returns:
        str or None: None only when `au` is None.

    Raises:
        MetaPubError: if none of the three forms is available.
    """
    if au is None:
        return None

    last_name = au.findtext('LastName')
    initials = au.findtext('Initials')
    if last_name and initials:
        return last_name + ' ' + initials

    collective_name = au.findtext('CollectiveName')
    if collective_name:
        return collective_name

    if last_name:
        return last_name

    raise MetaPubError("Author structure not recognized")
[docs]
def square_voliss_data_for_pma(pma):
    """Takes a PubMedArticle object, returns same object with corrected volume/issue
    information (if needed).

    Two in-place corrections, using `re_numbers` (imported from
    .text_mining; presumably a compiled pattern matching runs of digits -
    confirm there):

    * volume set but issue is None: if the volume contains more than one
      number, the first becomes the volume and the second the issue.
    * issue contains "Pt": the issue is replaced by the first number found
      in it. An issue with "Pt" but no number is left unchanged (this used
      to raise IndexError).

    :param pma: object with `volume` and `issue` attributes
    :return: the same `pma` object
    """
    if pma.volume is not None and pma.issue is None:
        # try to get a number out of the parts that came after the first number.
        volparts = re_numbers.findall(pma.volume)
        if len(volparts) > 1:
            pma.volume = volparts[0]
            # take a guess. best we can do. this often works (e.g. Brain journal)
            pma.issue = volparts[1]

    if pma.issue and pma.volume and 'Pt' in pma.issue:
        issue_numbers = re_numbers.findall(pma.issue)
        if issue_numbers:
            pma.issue = issue_numbers[0]
    return pma
[docs]
def determine_pubmed_xml_type(xmlstr):
    """ Returns string "type" of pubmed article XML based on presence of expected strings.

    Possible returns:
        'article'  (contains '<PubmedArticle>')
        'book'     (contains '<PubmedBookArticle>')
        'unknown'  (contains neither)

    Bytes-like input (bytes, bytearray) is decoded as UTF-8 before the
    check. Undecodable bytes are replaced rather than raising, since only
    the ASCII tag names matter here.

    :param xmlstr: xml as str, bytes or bytearray
    :return typestring: (str)
    :rtype: str
    """
    if isinstance(xmlstr, (bytes, bytearray)):
        xmlstr = bytes(xmlstr).decode('utf-8', errors='replace')

    if '<PubmedBookArticle>' in xmlstr:
        return 'book'
    if '<PubmedArticle>' in xmlstr:
        return 'article'
    return 'unknown'