Source code for xoutil.string

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# ---------------------------------------------------------------------
# xoutil.string
# ---------------------------------------------------------------------
# Copyright (c) 2015-2017 Merchise and Contributors
# Copyright (c) 2013, 2014 Merchise Autrement and Contributors
# Copyright (c) 2012 Medardo Rodriguez
# All rights reserved.
#
# Author: Medardo Rodríguez
# Contributors: see CONTRIBUTORS and HISTORY file
#
# This is free software; you can redistribute it and/or modify it under the
# terms of the LICENCE attached (see LICENCE file) in the distribution
# package.
#
# Created on Feb 17, 2012

'''Exposes all original `string` module functionalities, with some general
additions.

In this module `str` and `unicode` types are not used because Python 2.x and
Python 3.x treat strings differently.  `bytes` and `text_type` will be used
instead with the following conventions:

- In Python 2.x `str` is a synonym of `bytes`, and both `unicode` and `str`
  are string types inheriting from `basestring`.

- In Python 3.x `str` is always unicode, but the `unicode` and `basestring`
  types don't exist.  The `bytes` type can be used as an array where each
  item is one byte.

  Many methods are readjusted to these conditions.

'''

from __future__ import (division as _py3_division,
                        print_function as _py3_print,
                        # unicode_literals as _py3_unicode,
                        absolute_import as _py3_abs_imports)

from xoutil.deprecation import deprecated as _deprecated
from xoutil.eight import _py3

# Call `copy_members()` and keep its result only long enough to re-bind
# `Formatter` explicitly below.
# NOTE(review): presumably `copy_members()` copies the members of the standard
# `string` module into this one (the module docstring says all original
# `string` functionalities are exposed) -- confirm in `xoutil.modules`.
from xoutil.modules import copy_members as _copy_python_module_members
_pm = _copy_python_module_members()

Formatter = _pm.Formatter     # Redundant but needed to avoid IDE errors

# Remove the temporary names so they do not remain as module attributes.
del _copy_python_module_members, _pm


def force_encoding(encoding=None):
    '''Return a usable encoding name.

    A truthy `encoding` is returned unchanged.  Otherwise the value of
    `locale.getpreferredencoding()` is returned, or ``'UTF-8'`` if that value
    is empty as well.

    .. versionadded:: 1.2.0

    '''
    # TODO: Maybe use only `sys.getdefaultencoding()`
    if encoding:
        return encoding
    import locale
    preferred = locale.getpreferredencoding()
    return preferred if preferred else 'UTF-8'


def safe_decode(s, encoding=None):
    '''Similar to bytes `decode` method returning unicode.

    Decodes `s` using the given `encoding`, or determining one from the
    system (see `force_encoding`:func:).  Undecodable bytes are replaced
    instead of raising an error.

    If `s` is already text it is returned unchanged.  Values that can not be
    decoded at all (numbers and other non-bytes objects) are converted with
    ``text_type(s)``.

    If the given `encoding` is unknown, the system encoding is tried next and
    finally ``'UTF-8'``.

    Returning type depend on python version; if 2.x is `unicode` if 3.x `str`.

    .. versionadded:: 1.1.3

    '''
    from xoutil.eight import text_type
    if isinstance(s, text_type):
        return s
    # A bounded list of candidates replaces the former recursive retry, which
    # could recurse forever if the system encoding itself was unknown.
    candidates = (force_encoding(encoding), force_encoding(None), 'UTF-8')
    for candidate in candidates:
        try:
            # In Python 3 str(b'm') returns the string "b'm'" and not just "m",
            # passing the encoding fixes this.
            return text_type(s, candidate, 'replace')
        except LookupError:
            # The encoding is not known, try with the next candidate.
            continue
        except Exception:
            # For numbers and other stuff.  `Exception` (instead of a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            return text_type(s)
    return text_type(s)


def safe_encode(u, encoding=None):
    '''Similar to unicode `encode` method returning bytes.

    Encodes `u` using the given `encoding`, or determining one from the
    system (see `force_encoding`:func:).  Characters that can not be encoded
    are replaced instead of raising an error.

    If `u` is already `bytes` it is returned unchanged.  Non-string values
    are first converted to text.

    If the given `encoding` is unknown, the system encoding is tried next and
    finally ``'UTF-8'``.

    Returning type is always `bytes`; but in python 2.x is also `str`.

    .. versionadded:: 1.1.3

    '''
    # TODO: This is not nice for Python 3, bytes is not valid string any more
    #       See :func:`json.encoder.py_encode_basestring_ascii` of Python 2.x
    from xoutil.eight import string_types, text_type
    if isinstance(u, bytes):
        return u
    if isinstance(u, string_types):
        try:
            # In Python 2.x bytes does not allows an encoding argument.  In
            # Python 3 this call always fails for `str` (TypeError), and in
            # Python 2 it fails for non-ascii unicode (UnicodeEncodeError);
            # both cases fall through to the explicit encoding below.
            return bytes(u)
        except (TypeError, UnicodeError):
            pass
    text = text_type(u)
    # A bounded list of candidates replaces the former recursive retry, which
    # could recurse forever if the system encoding itself was unknown.
    candidates = (force_encoding(encoding), force_encoding(None), 'UTF-8')
    for candidate in candidates:
        try:
            return text.encode(candidate, 'replace')
        except LookupError:
            # The encoding is not known, try with the next candidate.
            continue
    return text.encode('UTF-8', 'replace')


def safe_str(obj=str()):
    '''Convert `obj` to the native `str` type in a safe way.

    Most of our Python 2.x code uses unicode as normal string; on the other
    hand, in Python 3 converting bytes or byte-arrays with ``str(value)``
    includes the "b" prefix in the resulting value.

    This function is useful in scenarios that require the `str` type (for
    example the attribute ``__name__`` in functions and types).

    Since ``str is bytes`` in Python 2, ``str(value)`` is correct in most of
    these scenarios, but it is not always enough, for example::

      >>> from xoutil.string import safe_str as sstr
      >>> def inverted_partial(func, *args, **keywords):
      ...     def inner(*a, **kw):
      ...         a += args
      ...         kw.update(keywords)
      ...         return func(*a, **kw)
      ...     inner.__name__ = sstr(func.__name__.replace('lambda', u'λ'))
      ...     return inner

    .. versionadded:: 1.7.0

    '''
    if not _py3:
        try:
            return str(obj)
        except UnicodeEncodeError:
            # Only unicode values with non-ascii chars are expected here.
            return safe_encode(obj)
    if isinstance(obj, (bytes, bytearray)):
        # Decode instead of calling `str` to avoid the "b'...'" form.
        return safe_decode(obj)
    return str(obj)


def safe_join(separator, iterable, encoding=None):
    '''Similar to `join` method in string objects `separator.join(iterable)`, a
    string which is the concatenation of the strings in the `iterable` with
    `separator` as intermediate between elements. Return unicode or bytes
    depending on type of `separator` and each item in `iterable`.

    `encoding` is used in case of error to concatenate bytes + unicode: when
    a piece can not be appended directly, both the accumulated result and
    that piece are decoded with `safe_decode`:func: before concatenating.

    If `iterable` is empty, an empty value of the type of `separator` is
    returned.

    This function must be deprecated in Python 3.

    .. versionadded:: 1.1.3

    .. warning:: The `force_separator_type` was removed in version 1.2.0.

    '''
    # Materialize first: a failed `join` exhausts one-shot iterators, which
    # would leave nothing for the item-by-item fallback below.
    items = list(iterable)
    try:
        return separator.join(items)
    except Exception:
        # Best effort: fall back to concatenating piece by piece.
        pass
    encoding = force_encoding(encoding)
    if not items:
        return type(separator)()
    res = items[0]
    for item in items[1:]:
        for tail in (separator, item):
            try:
                res += tail
            except Exception:
                # Decode the piece that failed (`tail`), which may be either
                # the separator or the item.
                res = (safe_decode(res, encoding) +
                       safe_decode(tail, encoding))
    return res


# Makes explicit the deprecation warning for py3k: under Python 3 the public
# name `safe_join` is re-bound to the result of wrapping the function with
# `_deprecated`.
# NOTE(review): the first argument looks like the suggested replacement and
# the second one the message to show -- confirm against
# `xoutil.deprecation.deprecated`.
if _py3:
    safe_join = _deprecated('builtin join method of str',
                            'safe_join is deprecated for Python 3. Use '
                            'builtin join method of str.')(safe_join)


def safe_strip(value):
    '''Strip leading and trailing space-chars when `value` is a string.

    Any non-string `value` is returned unchanged.

    .. versionadded:: 1.1.3

    '''
    from xoutil.eight import string_types
    if isinstance(value, string_types):
        return value.strip()
    return value


def cut_prefix(value, prefix):
    '''Return `value` without the leading `prefix`.

    If `value` does not start with `prefix`, it is returned unchanged.  When
    one argument is text and the other is bytes, `prefix` is converted to the
    type of `value` before comparing.

    '''
    from xoutil.eight import text_type, binary_type
    if isinstance(value, text_type):
        if isinstance(prefix, binary_type):
            prefix = safe_decode(prefix)
    elif isinstance(value, binary_type) and isinstance(prefix, text_type):
        prefix = safe_encode(prefix)
    if value.startswith(prefix):
        return value[len(prefix):]
    return value


def cut_any_prefix(value, *prefixes):
    '''Apply `cut_prefix`:func: for the first matching prefix.

    The `prefixes` are tried in order and the search stops as soon as one of
    them changes the value.

    '''
    current = value
    for prefix in prefixes:
        previous, current = current, cut_prefix(current, prefix)
        if current != previous:
            break
    return current


def cut_prefixes(value, *prefixes):
    '''Apply `cut_prefix`:func: for all provided prefixes in order.

    Each prefix is cut from the result of cutting the previous one.

    '''
    for prefix in prefixes:
        value = cut_prefix(value, prefix)
    return value


def cut_suffix(value, suffix):
    '''Return `value` without the trailing `suffix`.

    If `value` does not end with `suffix`, it is returned unchanged.  When
    one argument is text and the other is bytes, `suffix` is converted to the
    type of `value` before comparing.

    '''
    from xoutil.eight import text_type, binary_type
    if isinstance(value, text_type):
        if isinstance(suffix, binary_type):
            suffix = safe_decode(suffix)
    elif isinstance(value, binary_type) and isinstance(suffix, text_type):
        suffix = safe_encode(suffix)
    # An empty suffix must be handled apart: value.endswith('') is always
    # true, but value[:-0] is the same as value[:0], which is always empty.
    if len(suffix) == 0 or not value.endswith(suffix):
        return value
    return value[:-len(suffix)]


def cut_any_suffix(value, *suffixes):
    '''Apply `cut_suffix`:func: for the first matching suffix.

    The `suffixes` are tried in order and the search stops as soon as one of
    them changes the value.

    '''
    current = value
    for suffix in suffixes:
        previous, current = current, cut_suffix(current, suffix)
        if current != previous:
            break
    return current


def cut_suffixes(value, *suffixes):
    '''Apply `cut_suffix`:func: for all provided suffixes in order.

    Each suffix is cut from the result of cutting the previous one.

    '''
    for suffix in suffixes:
        value = cut_suffix(value, suffix)
    return value


def capitalize_word(value):
    '''Return `value` with its first char in upper-case.

    Empty values, and values not starting with a lower-case char, are
    returned unchanged.

    '''
    if not value or not value[0].islower():
        return value
    return value[0].upper() + value[1:]


def capitalize(value, title=True):
    '''Capitalizes value according to whether it should be title-like.

    Title-like means that every word is capitalized, except those having 3
    letters or less, unless it is the first word::

        >>> capitalize('a group is its own worst enemy')
        'A Group is its own Worst Enemy'

    (This may be odd because, in the example above, own should be capitalized.)

    If `title` is false, only the first word is capitalized.  Words are
    re-joined with a single space.

    Return bytes or unicode depending on type of `value`.

        >>> from xoutil.eight import text_type
        >>> type(capitalize(text_type('something'))) is text_type
        True

        >>> type(capitalize(str('something'))) is str
        True

    '''
    tstr = type(value)
    # Both constants are built up-front using the type of `value`.
    space, empty = tstr(' '), tstr('')
    words = value.split() if value else None
    if not words:
        return empty
    limit = len(words) if title else 1
    for idx, word in enumerate(words[:limit]):
        if idx == 0 or len(word) > 3:
            words[idx] = capitalize_word(word)
    return space.join(words)


def hyphen_name(name):
    '''Convert a name, normally an identifier, to a hyphened slug.

    A hyphen is inserted at the transitions matched by the internal pattern
    (lower-case letter or digit followed by an upper-case letter, letter
    followed by a digit, digit followed by a lower-case letter).

    The result is then passed to `normalize_slug`:func:, using the hyphen as
    replacement and the underscore as an extra invalid character.

    For example::

      >>> hyphen_name('BaseNode') == 'base-node'
      True

    '''
    import re
    regex = re.compile('([a-z0-9][A-Z]|[a-zA-Z][0-9]|[0-9][a-z])')
    # Each match marks a cut just after its first character.
    bounds = [0]
    bounds.extend(m.start() + 1 for m in regex.finditer(name))
    bounds.append(len(name))
    pieces = [name[a:b] for a, b in zip(bounds, bounds[1:])]
    hyphened = '-'.join(pieces)
    return safe_str(normalize_slug(hyphened, '-', '_'))


# TODO: Document and fix all these "normalize_..." functions
def normalize_unicode(value):
    '''Return a text representation for `value`.

    - ``None`` and empty strings are converted to the empty string.

    - ``True`` and ``False`` are converted to the texts "Sí" and "No".

    - Any other value is converted with `safe_decode`:func:.

    '''
    # FIXME: i18n
    # Test emptiness by type and truth value; an identity test against an
    # empty string literal only works by interpreter implementation detail.
    if value is None or (isinstance(value, (str, bytes)) and not value):
        return ''
    elif value is True:
        return safe_decode('Sí')
    elif value is False:
        return safe_decode('No')
    else:
        return safe_decode(value)


def normalize_name(value):
    '''Convert `value` with `normalize_unicode`:func: and capitalize it.'''
    text = normalize_unicode(value)
    return capitalize(text)


def normalize_title(value):
    '''Convert `value` with `normalize_unicode`:func: and capitalize it as a
    title.

    '''
    text = normalize_unicode(value)
    return capitalize(text, title=True)


def normalize_str(value):
    '''Normalize the spaces and the case of the words in `value`.

    The words (runs of non-space characters) are joined with a single space.
    Words with 3 or more characters are capitalized, shorter ones are
    converted to lower-case.

    Return bytes or text depending on the type of `value`::

      >>> normalize_str('hello to the WORLD') == 'Hello to The World'
      True

    '''
    import re
    pattern, sep = r'(\S+)\s*', ' '
    if isinstance(value, bytes):
        # `bytes(text)` fails in Python 3 because it requires an encoding;
        # `encode` works for these ASCII-only constants in both Python 2
        # and 3.
        pattern, sep = pattern.encode('ascii'), sep.encode('ascii')
    words = re.findall(pattern, value)
    return sep.join(w.capitalize() if len(w) >= 3 else w.lower()
                    for w in words)


def normalize_ascii(value):
    '''Return the ASCII string normal form for the `value`.

    The value is converted to text if needed (see `safe_decode`:func:),
    decomposed using the unicode 'NFKD' normalization, and then all remaining
    non-ascii characters are dropped.

    The result is converted with `safe_str`:func:.

    '''
    import unicodedata
    from xoutil.eight import text_type
    text = value if isinstance(value, text_type) else safe_decode(value)
    decomposed = unicodedata.normalize('NFKD', text)
    return safe_str(decomposed.encode('ascii', 'ignore'))


def normalize_slug(value, replacement='-', invalids=None, valids=None):
    '''Return the string normal form, valid for slugs, for the `value`

    Convert all non-ascii to valid characters using `normalize_ascii`:func:
    (unicode 'NFKD' normalization).

    Lower-case the result.

    Replace unwanted characters by `replacement`, repetition of given pattern
    will be converted to only one instance.  Leading and trailing
    replacements are removed.

    The `replacement` is inserted literally; backslashes in it have no
    special meaning.

    ``[_a-z0-9]`` are assumed as valid characters.  Extra arguments can modify
    this standard behaviour:

    :param replacement: A string, or ``None`` or ``False`` as synonymous of
           the empty string.  Any other type raises `TypeError`.

    :param invalids: Any collection of characters added to these that are
           normally invalid in the provided `value`. (non-ascii or not
           included in valid characters).  Boolean ``True`` can be passed as a
           synonymous of ``"_"`` for compatibility with old
           ``invalid_underscore`` argument.  ``False`` or ``None`` are assumed
           as an empty set for invalid characters.

    :param valids: A collection of extra valid characters (all non-ascii
           characters are ignored).  This parameter could be either a valid
           string, any iterator of valid strings of characters, or ``None`` to
           use only default valid characters (See above).

    .. warning:: The result may contain characters in `invalids` if
                 `replacements` does.

    Parameter `value` could be of any (non-string) type, it is normalized and
    converted to a lower-case ASCII string.

    Examples::

      >>> normalize_slug('  Á.e i  Ó  u  ') == 'a-e-i-o-u'
      True

      >>> normalize_slug('  Á.e i  Ó  u  ', '.', invalids='AU') == 'e.i.o'
      True

      >>> normalize_slug('  Á.e i  Ó  u  ', valids='.') == 'a.e-i-o-u'
      True

      >>> normalize_slug('_x', '_') == '_x'
      True

      >>> normalize_slug('-x', '_') == 'x'
      True

      >>> normalize_slug(None) == 'none'
      True

      >>> normalize_slug(1 == 1)  == 'true'
      True

      >>> normalize_slug(1.0) == '1-0'
      True

      >>> normalize_slug(135) == '135'
      True

      >>> normalize_slug(123456, '', invalids='52') == '1346'
      True

    .. versionchanged:: 1.5.5 Added the `invalid_underscore` parameter.

    .. versionchanged:: 1.6.6 Replaced the `invalid_underscore` paremeter by
       `invalids`.  Added the `valids` parameter.

    .. versionchanged:: 1.7.2 Clarified the role of `invalids` with regards to
       `replacement`.

    '''
    import re
    from xoutil.eight import string_types

    def _normalize(v):
        'Lower-case ASCII form of `v`.'
        return normalize_ascii(v).lower()

    def _char_class(chars):
        'Escaped and duplicate-free `chars`, to be embedded in a "[...]".'
        return re.escape(''.join(set(chars)))

    def _as_string(chars):
        'Join any non-string iterable of strings in a single string.'
        if isinstance(chars, string_types):
            return chars
        return ''.join(chars)

    # check and adjust arguments
    if replacement in (None, False):
        replacement = ''
    elif isinstance(replacement, string_types):
        replacement = normalize_ascii(replacement)    # TODO: or _normalize?
    else:
        msg = '`replacement` (%s) must be a string or None, not `%s`.'
        raise TypeError(msg % (replacement, type(replacement)))
    if invalids is True:
        # Backward compatibility with former `invalid_underscore` argument
        invalids = '_'
    elif invalids in (None, False):
        # A tuple is used (not a set) because membership in a set hashes
        # `invalids`, failing for unhashable collections like lists.
        invalids = ''
    else:
        invalids = _char_class(_normalize(_as_string(invalids)))
    if valids is None:
        valids = ''
    else:
        # Alphanumeric characters are always valid, no need to repeat them.
        extra = re.sub(r'[0-9a-z]+', '', _normalize(_as_string(valids)))
        valids = _char_class(extra)
    # calculate result
    res = _normalize(value)
    # A tab marks every position to be replaced, so cleaning up repetitions
    # and borders does not depend on the content of `replacement`.
    placeholder = '\t' if replacement else ''
    res = re.sub(r'[^_a-z0-9%s]+' % valids, placeholder, res)
    if invalids:
        res = re.sub(r'[%s]+' % invalids, placeholder, res)
    if placeholder:
        res = re.sub(r'\t{2,}', placeholder, res)
        res = res.strip(placeholder)
        # Plain string replacement: `re.sub` would interpret backslashes
        # contained in `replacement`.
        res = res.replace(placeholder, replacement)
    return res


def strfnumber(number, format_spec='%0.2f'):
    '''Format `number` using `format_spec` removing the trailing zeros.

    If the formatted text contains a dot, trailing ``0`` characters are
    removed, and then a dot left at the end is removed too.

    '''
    text = format_spec % number
    if '.' not in text:
        return text
    trimmed = text.rstrip('0')
    return trimmed[:-1] if trimmed.endswith('.') else trimmed


def parse_boolean(value):
    '''Parse a boolean from any value given a special treatment to
    strings.

    Strings are stripped first; then an empty string is false, a string of
    digits is evaluated as an integer, and any other string is true unless
    it is (ignoring case) one of "false", "no" or "not".

    Non-string values are converted with `bool`.

    >>> parse_boolean('trUe')
    True

    >>> parse_boolean('faLSe')
    False

    '''
    from xoutil.eight import string_types
    if not isinstance(value, string_types):
        return bool(value)
    value = value.strip()
    if not value:
        return False
    if value.isdigit():
        return bool(int(value))
    if isinstance(value, bytes):
        falses = (b'false', b'no', b'not')
    else:
        falses = ('false', 'no', 'not')
    return value.lower() not in falses


def parse_url_int(value, default=None):
    '''Parse an integer URL argument. Some operations treat simple
    arguments as a list of one element.

    If `value` is a non-empty list, tuple or set, its first element (any
    element for a set) is parsed instead.

    Return `default` if the value can not be converted to an integer.

    '''
    # TODO: Move to `xoutil.web`
    if isinstance(value, (list, tuple, set)) and len(value) > 0:
        # Sets do not support indexing, so take the first iterated element.
        value = next(iter(value))
    try:
        return int(safe_strip(value))
    except (TypeError, ValueError, OverflowError):
        return default


def error2str(error):
    '''Convert an error to string.

    - A string is returned as is (converted with `safe_str`:func:).

    - An exception instance is converted to its message; the name of its
      type is prepended (``"Name: message"``) unless the message already
      contains it.

    - An exception class is converted to its name.

    - Any other value produces a text prefixed with "unknown error: ".

    '''
    from xoutil.eight import string_types
    from xoutil.types import type_coerce
    if isinstance(error, string_types):
        return safe_str(error)
    elif isinstance(error, BaseException):
        tname = type(error).__name__
        res = safe_str(error)
        if tname in res:
            return res
        else:
            # `join` takes a single iterable argument.
            return str(': ').join((tname, res)) if res else tname
    elif isinstance(error, type) and issubclass(error, BaseException):
        # `issubclass` is only valid for classes, so check that first; the
        # name must be the one of the class itself, not of its metaclass.
        return error.__name__
    else:
        prefix = str('unknown error: ')
        # NOTE(review): presumably `type_coerce` returns `error` itself when
        # it is already a type, else its type -- confirm in `xoutil.types`.
        cls = type_coerce(error)
        tname = cls.__name__
        if cls is error:
            res = tname
        else:
            res = safe_str(error)
            if tname not in res:
                res = str('{}({})').format(tname, res) if res else tname
        return prefix + res


def force_str(value, encoding=None):
    '''Force to string, the type is different in Python 2 or 3 (bytes or
    unicode).

    A `value` that is already a `str` is returned unchanged.

    :param value: The value to convert to `str`.
    :param encoding: The encoding which should be used if either encoding
                     or decoding should be performed on `value`.

                     The default is to use the same default as
                     :func:`safe_encode` or :func:`safe_decode`.

    .. versionadded:: 1.2.0

    '''
    if isinstance(value, str):
        return value
    # ``str is bytes`` only in Python 2, where the value must be encoded.
    convert = safe_encode if str is bytes else safe_decode
    return convert(value, encoding)


def make_a10z(string):
    '''Utility to find out that "internationalization" is "i18n".

    The result is the first character, the count of characters between the
    first and the last one, and the last character.

    Examples::

       >>> print(make_a10z('parametrization'))
       p13n
    '''
    head = string[0]
    inner = string[1:-1]
    return head + str(len(inner)) + string[-1]


# Re-export `input` from `xoutil.eight` under this module, wrapped with
# `_deprecated` using the original function and an explicit message.
# NOTE(review): presumably the first argument is the suggested replacement
# and calling the wrapped function issues a deprecation warning -- confirm
# against `xoutil.deprecation.deprecated`.
from xoutil.eight import input

input = _deprecated(
    input,
    "xoutil.string.input is deprecated.  Use xoutil.eight.input"
)(input)