Source code for metapub.pubmedarticle

"""metapub.pubmedarticle -- PubMedArticle class instantiated by supplying ncbi XML string."""

import time
from datetime import datetime
from collections import OrderedDict

from .base import MetaPubObject
from .exceptions import MetaPubError
from .text_mining import re_numbers
from .pubmedauthor import PubMedAuthor

from . import cite



[docs]
class PubMedArticle(MetaPubObject):
    """This PubMedArticle class receives an XML string as its required argument
    and parses it into its constituent parts, exposing them as attributes.

    Usage:
        paper = PubMedArticle(xml_string)

    To query services to return an article by pmid, use PubMedFetcher, which
    returns PubMedArticle objects.

    When xmlstr is parsed, the `pubmed_type` attribute will be set to one of 'article' or 'book',
    depending on whether PubmedBookArticle or PubmedArticle headings are found in the supplied
    xmlstr at instantiation.

    Since this class needs to work seamlessly in production whether it's a book
    or an article, the PubmedArticle attributes will always be available (set to None in many
    cases for PubmedBookArticle, e.g. volume, issue, journal), but PubmedBookArticle
    attributes will only be set when pubmed_type='book'.

    PubMedBook special handling of certain attributes:
        * abstract: a joined string from self.book_abstracts
        * title: comes from ArticleTitle

    Special attributes for PubmedBookArticle (pubmed_type='book'):
        * book_accession_id (default: None) - string from IdType="bookaccession", e.g. "NBK1403"
        * book_title (default: None) - string with name of book (as differentiated from ArticleTitle)
        * book_publisher (default: None) - publisher name taken from Book/Publisher/PublisherName (the publisher location is not included)
        * book_sections (default: []) - dict with key->value pairs as section_name->SectionTitle
        * book_contribution_date (default: None) - python datetime date
        * book_date_revised (default: None) - python datetime date
        * book_history (default: [])  - dictionary with key->value pairs as PubStatus -> python datetime
        * book_language (default: None) - string (e.g. "eng")
        * book_editors (default: []) - list containing names from 'editors' AuthorList
        * book_abstracts (default: []) - dict with key->value pairs as Label->AbstractText.text)
        * book_medium (default: None) - string (e.g. "Internet")
        * book_synonyms (default: None) - list of disease synonyms (applicable to "gene" book)
        * book_publication_status (default: None) - string (e.g. "ppublish")
    """


[docs]
    def __init__(self, xmlstr, *args, **kwargs):
        """Initialize PubMedArticle from NCBI XML data.

        Args:
            xmlstr (str): XML string from NCBI containing PubmedArticle or
                PubmedBookArticle data.
            *args: Additional positional arguments forwarded to the parent class.
            **kwargs: Additional keyword arguments forwarded to the parent class.

        Note:
            The XML type is automatically detected to handle both regular articles
            and book chapters. The `pubmed_type` attribute will be set to 'article'
            or 'book' accordingly, and appropriate attributes will be populated.
            XML that matches neither heading is treated as an article rooted at '.'.
        """
        self.pubmed_type = determine_pubmed_xml_type(xmlstr)

        # args/kwargs are unpacked so the parent receives real positional and
        # keyword arguments instead of one tuple and one dict as positionals.
        if self.pubmed_type == 'book':
            self._root = 'BookDocument'
            super(PubMedArticle, self).__init__(xmlstr, 'PubmedBookArticle', *args, **kwargs)
        elif self.pubmed_type == 'article':
            self._root = 'MedlineCitation'
            super(PubMedArticle, self).__init__(xmlstr, 'PubmedArticle', *args, **kwargs)
        else:
            # assume we're here because of predownloaded Medline XML.
            self.pubmed_type = 'article'
            self._root = '.'
            super(PubMedArticle, self).__init__(xmlstr, None, *args, **kwargs)

        pmt = self.pubmed_type

        # shared between book and article types:
        self.pmid = self._get_pmid()
        self.url = self._get_url()
        self.authors = self._get_authors() if pmt == 'article' else self._get_book_authors()
        self.author_list = self._get_author_list() if pmt == 'article' else self._get_book_author_list()
        self.title = self._get_title() if pmt == 'article' else self._get_book_articletitle()
        # The next three read self.authors / self.author1_last_fm, so order matters.
        self.authors_str = self._get_authors_str()
        self.author1_last_fm = self._get_author1_last_fm()
        self.author1_lastfm = self._get_author1_lastfm()
        self.keywords = self._get_keywords()

        # 'article' only (not shared):
        self.pages = None if pmt == 'book' else self._get_pages()
        # first_page/last_page read self.pages (and last_page reads first_page).
        self.first_page = None if pmt == 'book' else self._get_first_page()
        self.last_page = None if pmt == 'book' else self._get_last_page()
        self.volume = None if pmt == 'book' else self._get_volume()
        self.issue = None if pmt == 'book' else self._get_issue()
        self.volume_issue = None if pmt == 'book' else self._get_volume_issue()
        self.doi = None if pmt == 'book' else self._get_doi()
        self.pii = None if pmt == 'book' else self._get_pii()
        self.pmc = None if pmt == 'book' else self._get_pmc()
        self.issn = None if pmt == 'book' else self._get_issn()

        # MeSH headings ('article' only)
        self.mesh = self._get_mesh_headings()

        # Chemical associations ('article' only)
        self.chemicals = self._get_chemicals()

        # Grant information (?? 'article' only ??)
        self.grants = self._get_grantlist()

        # Publication Types (?? 'article' only ??)
        self.publication_types = self._get_publication_types()

        # 'book' only:
        self.book_accession_id = None if pmt == 'article' else self._get_bookaccession_id()
        self.book_title = None if pmt == 'article' else self._get_book_title()
        self.book_publisher = None if pmt == 'article' else self._get_book_publisher()
        self.book_language = None if pmt == 'article' else self._get_book_language()
        self.book_editors = None if pmt == 'article' else self._get_book_editors()
        self.book_abstracts = None if pmt == 'article' else self._get_book_abstracts()
        self.book_sections = None if pmt == 'article' else self._get_book_sections()
        self.book_copyright = None if pmt == 'article' else self._get_book_copyright()
        self.book_medium = None if pmt == 'article' else self._get_book_medium()
        self.book_synonyms = None if pmt == 'article' else self._get_book_synonyms()
        self.book_publication_status = None if pmt == 'article' else self._get_book_publication_status()
        self.book_history = None if pmt == 'article' else self._get_book_history()
        self.book_contribution_date = None if pmt == 'article' else self._get_book_contribution_date()

        # Previously this was filled from the ContributionDate getter by mistake.
        # DateRevised may be absent, so only parse it when the element exists.
        if pmt == 'book' and self.content.find('BookDocument/DateRevised') is not None:
            self.book_date_revised = self._get_book_date_revised()
        else:
            self.book_date_revised = None

        # the shared oddballs, must be done last.
        self.abstract = self._get_abstract() if pmt == 'article' else self._get_book_abstract()
        self.journal = self.book_title if pmt == 'book' else self._get_journal()
        self.year = self._get_book_year() if pmt == 'book' else self._get_year()

        self.history = self._get_article_history()



[docs]
    def to_dict(self):
        """Convert PubMedArticle to dictionary representation.
        
        Returns:
            Dict[str, Any]: Dictionary containing all article attributes except
                internal XML content and processing attributes.
        
        Note:
            Excludes 'content', 'xml', and '_root' attributes from the output
            to provide a clean data representation suitable for serialization.
        """
        outd = self.__dict__.copy()
        outd.pop('content')
        outd.pop('xml')
        outd.pop('_root')
        return self.__dict__


    @property
    def citation(self):
        """Formatted plain-text citation for this record.

        Records with a book accession id are formatted by ``cite.book``; all
        others are formatted by ``cite.article`` using the ``to_dict()`` fields.

        Article Example:

        McNally EM, et al. Genetic mutations and mechanisms in dilated cardiomyopathy. Journal of Clinical Investigation. 2013; 123:19-26. doi: 10.1172/JCI62862.

        Book Example (GeneReviews):

        Tranebjarg L, et al. Jervell and Lange-Nielsen syndrome. 2002 Jul 29 (Updated 2014 Nov 20). In: Pagon RA, et al., editors. GeneReviews (Internet). Seattle (WA): University of Washington, Seattle; 1993-2015. Available from: https://www.ncbi.nlm.nih.gov/books/NBK1405/.
        """
        if not self.book_accession_id:
            return cite.article(**self.to_dict())
        # special handling for GeneReviews books
        return cite.book(self)

    @property
    def citation_html(self):
        """Formatted citation for this record with HTML markup.

        Same dispatch as ``citation``, but both ``cite`` helpers are called
        with ``as_html=True``.

        Article Example:

        McNally EM, <i>et al</i>. Genetic mutations and mechanisms in dilated cardiomyopathy. <i>Journal of Clinical Investigation</i>. 2013; <b>123</b>:19-26. doi: 10.1172/JCI62862.

        GeneReviews Example:
        Tranebjarg L, <i>et al</i>. <i>Jervell and Lange-Nielsen syndrome</i>. 2002 Jul 29 (Updated 2014 Nov 20). In: Pagon RA, <i>et al</i>., editors. GeneReviews (Internet). Seattle (WA): University of Washington, Seattle; 1993-2015. Available from: https://www.ncbi.nlm.nih.gov/books/NBK1405/.
        """
        if not self.book_accession_id:
            return cite.article(as_html=True, **self.to_dict())
        # special handling for GeneReviews books
        return cite.book(self, as_html=True)

    @property
    def citation_bibtex(self):
        """Citation produced by ``cite.bibtex`` from the ``to_dict()`` fields.

        ``isbook=True`` is passed when the record has a book accession id.
        """
        fields = self.to_dict()
        if self.book_accession_id:
            return cite.bibtex(isbook=True, **fields)
        return cite.bibtex(**fields)
    
    @property
    def pubdate(self):
        """Normalized publication date as datetime object.
        
        Returns the best available publication date from PubMed XML in order of preference:
        1. Article PubDate (Year/Month/Day or MedlineDate)
        2. Book contribution date  
        3. History dates (pubmed, entrez, etc.)
        
        Returns:
            datetime or None: Publication date as datetime object, or None if no date found
            
        Example:
            article = fetch.article_by_pmid('12345')
            if article.pubdate:
                print(f"Published: {article.pubdate.strftime('%Y-%m-%d')}")
        """
        if self.pubmed_type == 'book':
            # For books, use contribution date
            return self.book_contribution_date
        
        # For articles, try to construct from PubDate elements
        pubdate_element = self.content.find(self._root + '/Article/Journal/JournalIssue/PubDate')
        if pubdate_element is not None:
            constructed_date = self._construct_datetime(pubdate_element)
            if constructed_date:
                return constructed_date
        
        # Fallback to history dates if available
        if self.history:
            # Try common PubMed history statuses in order of preference
            for status in ['pubmed', 'entrez', 'received', 'accepted']:
                if status in self.history and self.history[status]:
                    return self.history[status]
        
        return None

    def _construct_datetime(self, d):
        names = ['Year', 'Month', 'Day']
        # if any part is missing, python will default to setting it to 1 anyway.
        parts = {'year': 1, 'month': 1, 'day': 1}
        
        # First try to parse structured date elements (Year, Month, Day)
        found_structured_date = False
        for name in names:
            if d.find(name) is not None:
                item = d.find(name).text
                found_structured_date = True
                try:
                    parts[name.lower()] = int(item)
                except ValueError:
                    if name.lower() == 'year':
                        # fixes spurious crap seen at least once: "2007 (details online)" (pmid 19659763)
                        parts['year'] = int(item[:4])
                    elif name.lower() == 'month':
                        # Force to 3-letter month name (months can look like "December", "Dec", "1")
                        parts['month'] = time.strptime(item[:3], '%b').tm_mon
                except TypeError:
                    # item is None
                    pass
        
        # Check for Season element if no Month was found
        if found_structured_date and parts['month'] == 1:  # Only override default month
            season_elem = d.find('Season')
            if season_elem is not None and season_elem.text:
                season_text = season_elem.text.strip().lower()
                season_to_month = {
                    'spring': 3,   # March
                    'summer': 6,   # June  
                    'fall': 9,     # September
                    'autumn': 9,   # September
                    'winter': 12   # December
                }
                if season_text in season_to_month:
                    parts['month'] = season_to_month[season_text]
        
        # If we found structured dates, use them
        if found_structured_date:
            try:
                return datetime(**parts)
            except ValueError:
                # one of the values didn't parse, or maybe it was like pmid 17924334
                # where the "accepted" year was "20007". at any rate, forget it.
                return None
        
        # If no structured date, try MedlineDate
        medline_elem = d.find('MedlineDate')
        if medline_elem is not None and medline_elem.text:
            return self._parse_medlinedate(medline_elem.text)
        
        # No date information found
        return None
    
    def _parse_medlinedate(self, medline_text):
        """Parse MedlineDate strings like '2007 Spring', '1999-2000', '2007 Mar-Apr'"""
        import re
        
        if not medline_text:
            return None
        
        # Clean the text
        text = medline_text.strip()
        
        # Extract 4-digit year - look for first occurrence
        year_match = re.search(r'\b(19|20)\d{2}\b', text)
        if not year_match:
            return None
        
        year = int(year_match.group())
        
        # Default to January 1st
        month = 1
        day = 1
        
        # Try to extract month information
        month_patterns = [
            # Full month names
            (r'\b(January|Jan)\b', 1), (r'\b(February|Feb)\b', 2), (r'\b(March|Mar)\b', 3),
            (r'\b(April|Apr)\b', 4), (r'\b(May)\b', 5), (r'\b(June|Jun)\b', 6),
            (r'\b(July|Jul)\b', 7), (r'\b(August|Aug)\b', 8), (r'\b(September|Sep)\b', 9),
            (r'\b(October|Oct)\b', 10), (r'\b(November|Nov)\b', 11), (r'\b(December|Dec)\b', 12),
            # Seasons (map to approximate months)
            (r'\b(Spring)\b', 3), (r'\b(Summer)\b', 6), (r'\b(Fall|Autumn)\b', 9), (r'\b(Winter)\b', 12),
        ]
        
        for pattern, month_num in month_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                month = month_num
                break
        
        # Try to extract day if present
        day_match = re.search(r'\b(\d{1,2})\b', text)
        if day_match:
            try:
                potential_day = int(day_match.group())
                if 1 <= potential_day <= 31:
                    day = potential_day
            except ValueError:
                pass
        
        try:
            return datetime(year=year, month=month, day=day)
        except ValueError:
            # Invalid date combination, fallback to year only
            return datetime(year=year, month=1, day=1)

    def _get_bookaccession_id(self):
        for item in self.content.findall('BookDocument/ArticleIdList/ArticleId'):
            if item.get('IdType') == 'bookaccession':
                return item.text

    def _get_book_title(self):
        """Look up BookDocument/Book/BookTitle via ``self._get``.

        ``_get`` is defined outside this block; it presumably returns the
        node text or None when the node is absent.
        """
        title_path = 'BookDocument/Book/BookTitle'
        return self._get(title_path)

    def _get_book_articletitle(self):
        """Look up BookDocument/ArticleTitle (the chapter title) via ``self._get``."""
        article_title_path = 'BookDocument/ArticleTitle'
        return self._get(article_title_path)

    def _get_book_authors(self):
        """Return the chapter authors as a list of name strings.

        Each BookDocument/AuthorList/Author element is converted with
        ``_xml_au_to_last_fm``.
        """
        author_nodes = self.content.findall('BookDocument/AuthorList/Author')
        return list(map(_xml_au_to_last_fm, author_nodes))

    def _get_book_author_list(self):
        """Return one PubMedAuthor per BookDocument/AuthorList/Author element."""
        author_nodes = self.content.findall('BookDocument/AuthorList/Author')
        return list(map(PubMedAuthor, author_nodes))

    def _get_book_publisher(self):
        """Look up BookDocument/Book/Publisher/PublisherName via ``self._get``."""
        publisher_name_path = 'BookDocument/Book/Publisher/PublisherName'
        return self._get(publisher_name_path)

    def _get_book_publisher_location(self):
        """Look up BookDocument/Book/Publisher/PublisherLocation via ``self._get``."""
        publisher_location_path = 'BookDocument/Book/Publisher/PublisherLocation'
        return self._get(publisher_location_path)

    def _get_book_language(self):
        """Look up BookDocument/Language via ``self._get``."""
        language_path = 'BookDocument/Language'
        return self._get(language_path)

    def _get_book_editors(self):
        """Return the book-level authors (editors) as a list of name strings.

        Reads BookDocument/Book/AuthorList/Author and converts each element
        with ``_xml_au_to_last_fm``.
        """
        editor_nodes = self.content.findall('BookDocument/Book/AuthorList/Author')
        return list(map(_xml_au_to_last_fm, editor_nodes))

    def _get_book_abstracts(self):
        """Return an OrderedDict mapping each AbstractText Label to its text.

        Text is produced by ``self._extract_text``. Entries are keyed by the
        Label attribute (None when the attribute is missing), so a repeated
        label keeps only the last text.
        """
        sections = self.content.findall('BookDocument/Abstract/AbstractText')
        return OrderedDict(
            (section.get('Label'), self._extract_text(section)) for section in sections
        )

    def _get_book_sections(self):
        sections = {}
        for item in self.content.findall('BookDocument/Sections/Section'):
            sec_title = item.find('SectionTitle')
            sections[sec_title.get('sec')] = sec_title.text
        return sections

    def _get_book_abstract(self):
        abstract_strs = ['%s: %s' % (key, val) for key, val in self.book_abstracts.items()]
        return '\n'.join(abstract_strs)

    def _get_book_copyright(self):
        """Look up BookDocument/Abstract/CopyrightInformation via ``self._get``."""
        copyright_path = 'BookDocument/Abstract/CopyrightInformation'
        return self._get(copyright_path)

    def _get_book_medium(self):
        """Look up BookDocument/Book/Medium via ``self._get``."""
        medium_path = 'BookDocument/Book/Medium'
        return self._get(medium_path)

    def _get_book_contribution_date(self):
        contribution_date_element = self.content.find('BookDocument/ContributionDate')
        if contribution_date_element is not None:
            return self._construct_datetime(self.content.find('BookDocument/ContributionDate'))
        return None

    def _get_book_date_revised(self):
        return self._construct_datetime(self.content.find('BookDocument/DateRevised'))

    def _get_book_synonyms(self):
        syn_list = self.content.find('BookDocument/ItemList')
        if syn_list is not None and syn_list.get('ListType') == 'Synonyms':
            return [item.text for item in self.content.findall('BookDocument/ItemList/Item')]
        else:
            return []

    def _get_book_history(self):
        history = {}
        items = self.content.findall('PubmedBookData/History/PubMedPubDate')
        for item in items:
            history[item.get('PubStatus')] = self._construct_datetime(item)
        return history

    def _get_book_publication_status(self):
        """Look up PubmedBookData/PublicationStatus via ``self._get``."""
        status_path = 'PubmedBookData/PublicationStatus'
        return self._get(status_path)

    def _get_book_year(self):
        if self.book_contribution_date:
            return self.book_contribution_date.year
        return None

    def _get_pmid(self):
        """Look up the PMID node directly under ``self._root`` via ``self._get``."""
        pmid_path = self._root + '/PMID'
        return self._get(pmid_path)

    def _get_url(self):
        return 'https://ncbi.nlm.nih.gov/pubmed/'+str(self.pmid)

    def _get_abstract(self):
        """Return the article abstract as a single string.

        * no AbstractText nodes: the result of ``self._get`` on the same path;
        * exactly one node: its text via ``self._extract_text``;
        * several nodes (structured abstract, see
          https://www.nlm.nih.gov/bsd/policy/structured_abstracts.html):
          "Label: text" lines joined with newlines.

        NOTE(review): sections are keyed by their Label attribute, so nodes
        sharing a label (including several unlabeled nodes, keyed as None)
        overwrite one another and only the last text survives.
        """
        abstract_path = self._root + '/Article/Abstract/AbstractText'
        nodes = self.content.findall(abstract_path)
        if nodes == []:
            return self._get(abstract_path)

        if len(nodes) == 1:
            return self._extract_text(nodes[0])

        by_label = OrderedDict()
        for node in nodes:
            by_label[node.get('Label')] = self._extract_text(node)
        return '\n'.join('%s: %s' % (label, text) for label, text in by_label.items())

    def _get_authors(self):
        """Return the article authors as a list of name strings.

        Each <root>/Article/AuthorList/Author element is converted with
        ``_xml_au_to_last_fm``. The list may be empty: citations can have
        0 authors, e.g. pmid:7550356.
        """
        author_nodes = self.content.findall(self._root + '/Article/AuthorList/Author')
        return list(map(_xml_au_to_last_fm, author_nodes))

    def _get_author_list(self):
        """Return one PubMedAuthor per <root>/Article/AuthorList/Author element."""
        author_nodes = self.content.findall(self._root + '/Article/AuthorList/Author')
        return list(map(PubMedAuthor, author_nodes))

    def _get_authors_str(self):
        return '; '.join(self.authors)

    def _get_author1_last_fm(self):
        """ return first author's name, in format Last INITS (space between surname and initials)"""
        # return _xml_au_to_last_fm(self.content.find(self._root+'/Article/AuthorList/Author'))
        if self.authors:
            return self.authors[0]
        else:
            return None

    def _get_author1_lastfm(self):
        """return first author's name, in format LastINITS (no space between surname and initials)"""
        if self.author1_last_fm is not None:
            return self.author1_last_fm.replace(' ', '')
        return None

    def _get_keywords(self):
        keyword_list = [kw.text for kw in self.content.findall(self._root+'/KeywordList/Keyword')]
        return keyword_list

    def _get_journal(self):
        """Return the journal's ISOAbbreviation, falling back to its Title.

        The Title lookup is only made when the ISOAbbreviation lookup
        returns None, e.g. https://www.ncbi.nlm.nih.gov/pubmed?term=21242195
        """
        journal_base = self._root + '/Article/Journal/'
        abbreviation = self._get(journal_base + 'ISOAbbreviation')
        if abbreviation is not None:
            return abbreviation
        return self._get(journal_base + 'Title')

    def _get_pages(self):
        """Look up <root>/Article/Pagination/MedlinePgn via ``self._get``."""
        pagination_path = self._root + '/Article/Pagination/MedlinePgn'
        return self._get(pagination_path)

    def _get_first_page(self):
        try:
            return self.pages.split('-')[0]
        except AttributeError:
            return self.pages

    def _get_last_page(self):
        try:
            lastnum = self.pages.split('-')[1]
        except (IndexError, AttributeError):
            return None
        try:
            # Return true last page from pages attribute, i.e if self.pages is
            # "148-52", return "152".  If self.pages is "291-4", return "294".
            if int(lastnum) < int(self.first_page):
                return self.first_page[:-len(lastnum)] + lastnum

            # If lastpage for some reason was not a number, just return it as-is.
        except (ValueError, TypeError):
            return lastnum

    def _get_title(self):
        """Look up <root>/Article/ArticleTitle via ``self._get``."""
        title_path = self._root + '/Article/ArticleTitle'
        return self._get(title_path)

    def _get_volume(self):
        try:
            return self.content.find(self._root+'/Article/Journal/JournalIssue/Volume').text
        except AttributeError:
            return None

    def _get_issue(self):
        try:
            return self.content.find(self._root+'/Article/Journal/JournalIssue/Issue').text
        except AttributeError:
            return None

    def _get_volume_issue(self):
        ji = self.content.find(self._root+'/Article/Journal/JournalIssue')
        try:
            return '%s(%s)' % (ji.find('Volume').text, ji.find('Issue').text)

        except AttributeError:
            pass
        try:
            return ji.find('Volume').text
        except AttributeError:
            pass
        # electronic pubs may not have volume or issue
        # e.g., https://www.ncbi.nlm.nih.gov/pubmed?term=20860988
        return None

    def _get_article_history(self):
        history = {}
        pubdates = self.content.find('PubmedData/History')
        if pubdates is not None:
            for pubdate in pubdates.getchildren():
                history[pubdate.get('PubStatus')] = self._construct_datetime(pubdate)
        return history

    def _get_year(self):
        """Return the publication year from the JournalIssue PubDate.

        Uses PubDate/Year when ``self._get`` finds it; otherwise the first
        four characters of PubDate/MedlineDate (case applicable for
        pmid:9887384 at least). Returns None when neither is available.
        """
        pubdate_path = self._root + '/Article/Journal/JournalIssue/PubDate'
        year = self._get(pubdate_path + '/Year')
        if year is not None:
            return year
        try:
            return self._get(pubdate_path + '/MedlineDate')[0:4]
        except TypeError:
            # MedlineDate lookup gave None, which cannot be sliced
            return None

    def _get_doi(self):
        """Look up the ArticleId with IdType "doi" in PubmedData via ``self._get``."""
        doi_path = 'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]'
        return self._get(doi_path)

    def _get_pii(self):
        """Look up the ArticleId with IdType "pii" in PubmedData via ``self._get``."""
        pii_path = 'PubmedData/ArticleIdList/ArticleId[@IdType="pii"]'
        return self._get(pii_path)

    def _get_pmc(self):
        """Return the "pmc" ArticleId with its first three characters removed.

        The slice presumably strips a leading "PMC" prefix (TODO confirm).
        Returns None when the lookup result cannot be sliced.
        """
        pmc_path = 'PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]'
        try:
            prefixed_id = self._get(pmc_path)
            return prefixed_id[3:]
        except TypeError:
            return None

    def _get_issn(self):
        """Look up <root>/Article/Journal/ISSN via ``self._get``."""
        issn_path = self._root + '/Article/Journal/ISSN'
        return self._get(issn_path)

    def _get_mesh_headings(self):
        if self.pubmed_type == 'book':
            return None

        meshtags = self.content.findall('MedlineCitation/MeshHeadingList/MeshHeading')
        outd = {}
        for mesh in meshtags:
            descript = mesh.find('DescriptorName')  # should always be present
            dui = descript.get('UI')

            qualifiers_list = []
            for qual in mesh.findall('QualifierName'):
                qualifiers_list.append({
                    'qualifier_name': qual.text,
                    'qualifier_ui': qual.get('UI'),
                    'qualifier_major_topic': True if qual.get('MajorTopicYN') == 'Y' else False,
                })

            outd[dui] = {
                'descriptor_name': descript.text,
                'descriptor_major_topic': True if descript.get('MajorTopicYN') == 'Y' else False,
                'qualifiers': qualifiers_list,
            }
        return outd

    def _get_chemicals(self):
        if self.pubmed_type == 'book':
            return None

        outd = {}
        chemicals = self.content.findall('MedlineCitation/ChemicalList/Chemical')
        for chem in chemicals:
            substance = chem.find('NameOfSubstance')
            regnum = chem.find('RegistryNumber').text  # very often this is '0'
            outd[substance.get('UI')] = {
                    'substance_name': substance.text,
                    'registry_number': regnum
                }
        return outd

    def _get_publication_types(self):
        outd = {}
        pubtypes = self.content.findall('MedlineCitation/Article/PublicationTypeList/PublicationType')
        for pt in pubtypes:
            outd[pt.get('UI')] = pt.text
        return outd

    def _get_grantlist(self):
        outl = []
        grants = self.content.findall('MedlineCitation/GrantList')
        for gr in grants:
            outl.append({'agency': gr.get('Agency', None), 'country': gr.get('Country', None)})
        return outl

    def __str__(self):
        # [article example] 
        # Asensio C, Pérez-Díaz JC. A new family of low molecular weight antibiotics from enterobacteria. Biochem Biophys Res Commun. 1976 Mar 8;69(1):7-14. 
        if self.pubmed_type == 'article':
            return '<PubMedArticle {pmid}> {authors_str}. {title}. {journal}. {year}. {volume_issue}:{pages}'.format(**self.to_dict())
        else:
            return '<PubMedBook {pmid}> {title}. {authors_str}. {book_title}. {year}'.format(**self.to_dict())



############################################################################
## Utilities

def _xml_au_to_last_fm(au):
    """Medline XML specific conversion of author name to lastname-firstinitial format.

    Resolution order:
    1. "LastName Initials" when both have text;
    2. the CollectiveName text;
    3. the LastName text alone.

    An element that is present but empty is treated as missing, so an empty
    <Initials/> no longer raises TypeError from string concatenation.

    Args:
        au: Author XML element, or None.

    Returns:
        str or None: the formatted name; None only when ``au`` is None.

    Raises:
        MetaPubError: when none of the expected name parts carries text.
    """
    if au is None:
        return None

    def _text_of(tag):
        # Text of the first matching child, or None if absent/empty.
        node = au.find(tag)
        return node.text if node is not None else None

    last_name = _text_of('LastName')
    initials = _text_of('Initials')
    if last_name and initials:
        return last_name + ' ' + initials

    collective_name = _text_of('CollectiveName')
    if collective_name:
        return collective_name

    if last_name:
        return last_name

    raise MetaPubError("Author structure not recognized")



[docs]
def square_voliss_data_for_pma(pma):
    """Takes a PubMedArticle object, returns same object with corrected volume/issue
    information (if needed).

    Two in-place corrections are applied:

    * volume set but issue missing: when ``re_numbers`` finds more than one
      number in the volume, the first becomes the volume and the second the
      issue (a guess that often works, e.g. Brain journal);
    * issue containing "Pt": the issue is replaced by the first number found
      in it. If it contains no number it is left unchanged (previously this
      raised IndexError).

    :param pma: object with ``volume`` and ``issue`` attributes
    :return: the same ``pma`` object
    """
    if pma.volume is not None and pma.issue is None:
        # try to get a number out of the parts that came after the first number.
        volparts = re_numbers.findall(pma.volume)
        if len(volparts) > 1:
            pma.volume = volparts[0]
            # take a guess. best we can do.
            pma.issue = volparts[1]

    if pma.issue and pma.volume and 'Pt' in pma.issue:
        issue_numbers = re_numbers.findall(pma.issue)
        if issue_numbers:
            pma.issue = issue_numbers[0]
    return pma



[docs]
def determine_pubmed_xml_type(xmlstr):
    """ Returns string "type" of pubmed article XML based on presence of expected strings.

    Possible returns:
        'article'  - the text contains '<PubmedArticle>'
        'book'     - the text contains '<PubmedBookArticle>' (checked first)
        'unknown'  - neither marker found, or the input is empty/None

    Bytes input is searched directly with byte markers, so it is neither
    copied nor decoded and undecodable bytes cannot raise.

    :param xmlstr: xml in any data type (str, bytes, bytearray), or None
    :return typestring: (str)
    :rtype: str
    """
    if not xmlstr:
        # None / '' / b'': nothing to inspect. Returning 'unknown' leaves any
        # validation of empty input to the caller.
        return 'unknown'

    if isinstance(xmlstr, (bytes, bytearray)):
        book_marker, article_marker = b'<PubmedBookArticle>', b'<PubmedArticle>'
    else:
        book_marker, article_marker = '<PubmedBookArticle>', '<PubmedArticle>'

    if book_marker in xmlstr:
        return 'book'
    if article_marker in xmlstr:
        return 'article'
    return 'unknown'