# Source code for metapub.utils

import unicodedata

from unidecode import unidecode

from urllib.parse import urlparse, unquote

# Punctuation characters stripped from strings: the default `chars` of remove_chars(),
# and the set that parameterize() always removes.
PUNCS_WE_DONT_LIKE = "[],.()<>'/?;:\"&"



[docs]
def kpick(args, options, default=None):
    """ Pick a value out of a mapping by trying several candidate keys in order.

    The first key in `options` whose value in `args` is truthy wins. Keys that are
    missing, or whose value is falsy (None, '', 0, empty container), are skipped.

    :param args: (dict-like) mapping to look values up in
    :param options: (iterable) candidate keys, in order of preference
    :param default: value returned when no candidate key yields a truthy value [default: None]
    :return: the first truthy value found, else `default`
    """
    for key in options:
        if not args.get(key, None):
            continue
        return args[key]
    return default




[docs]
def remove_chars(inp, chars=PUNCS_WE_DONT_LIKE, urldecode=False):
    """ Strip every occurrence of the target characters out of the input string.

    :param inp: (str) text to clean
    :param chars: (str) characters to remove [default: utils.PUNCS_WE_DONT_LIKE]
    :param urldecode: (bool) whether to first urldecode the input string [default: False]
    :return: (str) the input with all target characters deleted
    """
    # Percent-decoding happens before stripping, so decoded punctuation is removed too.
    cleaned = unquote(inp) if urldecode else inp

    for unwanted in chars:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned




[docs]
def hostname_of(url):
    """ Takes a url (may or may not contain protocol prefix) and returns the simplest base form of the
    hostname in the supplied URL.

    If hostname starts with 'www.', that single leading label is stripped out. Any other
    occurrence of 'www' inside the hostname is left untouched.

    Examples:
        http://www.nature.com/pr/journal/v49/n1/full/pr20018a.html --> nature.com
        https://webhome.weizmann.ac.il --> webhome.weizmann.ac.il
        https://www.ncbi.nlm.nih.gov/pubmed/17108762 --> ncbi.nlm.nih.gov
        httpbin.org/get --> httpbin.org

    :param url: (str)
    :return hostname: (str)
    :raises AttributeError: if no hostname can be extracted at all (e.g. an empty string)
    """
    hostname = urlparse(url).hostname
    if hostname is None:
        # urlparse only recognises a host after '//'. A bare 'example.com/path' (even one
        # that happens to start with the letters 'http') has none, so retry with a scheme.
        hostname = urlparse('http://' + url).hostname

    # Strip only the leading 'www.' label; str.replace would also eat 'www.' found
    # elsewhere in the name (e.g. 'www.awww.com' -> 'acom').
    if hostname.startswith('www.'):
        hostname = hostname[len('www.'):]
    return hostname




[docs]
def rootdomain_of(url):
    """ Returns the root domain of the hostname of the supplied URL.

    The root domain is simply the last two dot-separated labels of the hostname;
    no public-suffix awareness is applied (see the 'ac.il' example below).

    Examples:
        http://blood.oxfordjournals.org --> oxfordjournals.org
        https://webhome.weizmann.ac.il --> ac.il
        https://regex101.com/ --> regex101.com
        https://www.ncbi.nlm.nih.gov/pubmed/17108762 --> nih.gov

    :param url: (str)
    :return rootdomain: (str)
    """
    labels = hostname_of(url).split('.')
    return '.'.join(labels[-2:])




[docs]
def asciify(inp):
    """ Nuke all the unicode from orbit. It's the only way to be sure.

    WARNING: this function is mostly used for Python2 compatibility and other legacy stuff,
    and may be removed in upcoming versions of metapub. A DeprecationWarning is emitted
    on every call.

    Text (str) input has every non-ASCII character dropped. Bytes input is decoded as
    UTF-8 and NFKD-normalized first, so accented letters keep their base letter.

    :param inp: (str, bytes or None)
    :return: ASCII-only bytes for non-empty input; the empty str '' for falsy input
    :raises UnicodeDecodeError: if bytes input is not valid UTF-8
    """
    # Function-scope import: this module does not import `warnings` at top level.
    import warnings

    # TODO: be more diplomatic than an atomic bomb: convert international chars to ascii equivalents.
    # see http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    warnings.warn(
        'asciify is a relic of The Great Python2 Unicode Wars, and will be removed in v0.6.0.',
        DeprecationWarning,
        stacklevel=2,
    )
    if not inp:
        return ''

    if isinstance(inp, bytes):
        # bytes has no .encode() in Python 3, so decode first, then decompose accents
        # before discarding whatever is still non-ASCII.
        return unicodedata.normalize('NFKD', inp.decode('utf-8')).encode('ascii', 'ignore')

    return inp.encode('ascii', 'ignore')




[docs]
def squash_spaces(inp):
    """ Collapse every run of whitespace into a single ' ' character.

    Tabs and newlines count as whitespace too, and leading/trailing whitespace
    is dropped entirely.

    :param inp: (str)
    :return: (str) the words of the input separated by exactly one space each.
    """
    words = inp.split()
    return ' '.join(words)




[docs]
def parameterize(inp, sep='+'):
    """ Turn a string into something suitable for submission to a GET-based query service.

    Characters named in metapub.utils.PUNCS_WE_DONT_LIKE are stripped, whitespace is
    squashed, remaining spaces become `sep`, and the result is passed through unidecode.

    If inp is None, return empty string.

    :param inp: (str or None): input to be parameterized
    :param sep: (str): separator to use in place of spaces (default='+')
    :return: "parameterized" str
    """
    if inp is not None:
        stripped = remove_chars(inp, PUNCS_WE_DONT_LIKE)
        separated = squash_spaces(stripped).replace(' ', sep)
        return unidecode(separated)

    return ''




[docs]
def deparameterize(inp, sep='+'):
    """ Roughly reverse parameterization: every separator (sep) becomes a space again.

    Characters that were stripped during parameterization are not restored.

    :param inp: (str)
    :param sep: (str) default: '+'
    :return: "deparameterized" string
    """
    spaced = inp.replace(sep, ' ')
    return spaced




[docs]
def remove_html_markup(inp):
    """ Remove html and xml tags from text.
    Preserves HTML entities like &amp;

    Quoted attribute values are skipped as a unit, so a '>' or a different kind of
    quote character inside them (e.g. title="it's") does not end the tag early.
    Quote characters outside of tags are ordinary text and are kept. A stray '>'
    outside of any tag is dropped.

    :param inp: (str)
    :return: string with HTML and XML markup removed.
    """
    in_tag = False
    open_quote = None   # the quote character that opened the current attribute value
    kept = []

    for char in inp:
        if open_quote is not None:
            # Inside a quoted attribute value: only the matching quote closes it.
            if char == open_quote:
                open_quote = None
        elif char == '<':
            in_tag = True
        elif char == '>':
            in_tag = False
        elif in_tag:
            if char in ('"', "'"):
                open_quote = char
        else:
            kept.append(char)
    return ''.join(kept)




[docs]
def lowercase_keys(dct):
    """ Build a new dictionary from the input with every key lowercased.

    The input is not modified. When two keys differ only by case, the one that
    comes later in iteration order wins.

    :param dct: (dict) mapping with string keys
    :return: (dict) new dictionary with lowercased keys and the original values
    """
    return {key.lower(): value for key, value in dct.items()}