Source code for xoutil.string

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# ---------------------------------------------------------------------
# xoutil.string
# ---------------------------------------------------------------------
# Copyright (c) 2015-2017 Merchise and Contributors
# Copyright (c) 2013, 2014 Merchise Autrement and Contributors
# Copyright (c) 2012 Medardo Rodriguez
# All rights reserved.
#
# Author: Medardo Rodríguez
# Contributors: see CONTRIBUTORS and HISTORY file
#
# This is free software; you can redistribute it and/or modify it under the
# terms of the LICENCE attached (see LICENCE file) in the distribution
# package.
#
# Created on Feb 17, 2012

'''Exposes all original `string` module functionalities, with some general
additions.

In this module the `str` and `unicode` types are not used because Python 2.x
and Python 3.x treat strings differently.  `bytes` and `text_type` are used
instead, with the following conventions:

- In Python 2.x `str` is a synonym of `bytes`, and both `unicode` and `str`
  are string types inheriting from `basestring`.

- In Python 3.x `str` is always unicode, and the `unicode` and `basestring`
  types don't exist. The `bytes` type can be used as an array of one-byte items.

  Many methods are readjusted to these conditions.

'''

from __future__ import (division as _py3_division,
                        print_function as _py3_print,
                        # unicode_literals as _py3_unicode,
                        absolute_import as _py3_abs_imports)

from xoutil.deprecation import deprecated as _deprecated
from xoutil.eight import _py3

from xoutil.modules import copy_members as _copy_python_module_members
# NOTE(review): presumably this copies the members of the standard `string`
# module into this module and returns the source module -- confirm against
# `xoutil.modules.copy_members`.
_pm = _copy_python_module_members()

Formatter = _pm.Formatter     # Redundant but needed to avoid IDE errors

# Drop the temporary helper names from this module's namespace.
del _copy_python_module_members, _pm


def force_encoding(encoding=None):
    '''Return a usable encoding name.

    A truthy `encoding` is returned unchanged.  Otherwise the result of
    `locale.getpreferredencoding()` is used, falling back to ``'UTF-8'``
    when that is empty too.

    .. versionadded:: 1.2.0

    '''
    # TODO: Maybe use only `sys.getdefaultencoding()`
    import locale
    if encoding:
        return encoding
    preferred = locale.getpreferredencoding()
    if preferred:
        return preferred
    return 'UTF-8'
def safe_decode(s, encoding=None):
    '''Similar to bytes `decode` method returning unicode.

    Decodes `s` using the given `encoding`, or determining one from the
    system (see `force_encoding`:func:).  Bytes that cannot be decoded are
    replaced instead of raising an error.

    Values that are already text are returned unchanged.  Values that cannot
    be decoded at all (numbers, for instance) are converted with the text
    type constructor.

    Returning type depend on python version; if 2.x is `unicode` if 3.x
    `str`.

    .. versionadded:: 1.1.3

    '''
    from xoutil.eight import text_type
    if isinstance(s, text_type):
        return s
    encoding = force_encoding(encoding)
    try:
        # In Python 3 str(b'm') returns the string "b'm'" and not just "m",
        # this fixes this.
        return text_type(s, encoding, 'replace')
    except LookupError:
        # The provided encoding is not known, try with the system one.
        return safe_decode(s)
    except Exception:
        # For numbers and other stuff.  `Exception` (and not a bare
        # ``except``) so that KeyboardInterrupt and SystemExit propagate.
        return text_type(s)
def safe_encode(u, encoding=None):
    '''Similar to unicode `encode` method returning bytes.

    Encodes `u` using the given `encoding`, or determining one from the
    system (see `force_encoding`:func:).  Characters that cannot be encoded
    are replaced instead of raising an error.  If `encoding` is not a known
    codec, the system encoding is tried instead.

    Values that are already `bytes` are returned unchanged; values that are
    not strings are converted to text first.

    Returning type is always `bytes`; but in python 2.x is also `str`.

    .. versionadded:: 1.1.3

    '''
    # TODO: This is not nice for Python 3, bytes is not valid string any more
    #       See :func:`json.encoder.py_encode_basestring_ascii` of Python 2.x
    from xoutil.eight import string_types, text_type
    if isinstance(u, bytes):
        return u
    encoding = force_encoding(encoding)
    try:
        if isinstance(u, string_types):
            try:
                # In Python 2.x bytes does not allows an encoding argument.
                return bytes(u)
            except Exception:
                # Python 3 needs an explicit encoding and Python 2 fails
                # for non-ASCII text; both cases are handled by the
                # explicit encode below.
                pass
        return text_type(u).encode(encoding, 'replace')
    except LookupError:
        # Unknown `encoding`: retry with the system one.
        return safe_encode(u)
def safe_str(obj=str()):
    '''Convert `obj` to the native `str` type in a safe way.

    Most of our Python 2.x code uses unicode as normal string, and in
    Python 3 converting bytes or byte-arrays with `str` includes the "b"
    prefix in the resulting value.  This function helps in scenarios that
    require the `str` type (for example the ``__name__`` attribute of
    functions and types).

    As ``str is bytes`` in Python 2, ``str(value)`` is correct in most cases
    but not in all of them, for example::

        >>> from xoutil.string import safe_str as sstr
        >>> def inverted_partial(func, *args, **keywords):
        ...     def inner(*a, **kw):
        ...         a += args
        ...         kw.update(keywords)
        ...         return func(*a, **kw)
        ...     inner.__name__ = sstr(func.__name__.replace('lambda', u'λ'))
        ...     return inner

    .. versionadded:: 1.7.0

    '''
    if not _py3:
        try:
            return str(obj)
        except UnicodeEncodeError:
            # `obj` is unicode text that the default codec cannot encode.
            return safe_encode(obj)
    if isinstance(obj, (bytes, bytearray)):
        # Decode instead of producing the "b'...'" representation.
        return safe_decode(obj)
    return str(obj)
def safe_join(separator, iterable, encoding=None):
    '''Similar to `join` method in string objects `separator.join(iterable)`.

    Return a string which is the concatenation of the strings in the
    `iterable` with `separator` as intermediate between elements.  Return
    unicode or bytes depending on type of `separator` and each item in
    `iterable`.

    `encoding` is used in case of error to concatenate bytes + unicode: the
    pieces that cannot be concatenated directly are decoded with
    `safe_decode`:func: first.

    An empty `iterable` yields an empty value of the type of `separator`.

    This function must be deprecated in Python 3.

    .. versionadded:: 1.1.3

    .. warning:: The `force_separator_type` was removed in version 1.2.0.

    '''
    # Materialize first: a one-shot iterator would otherwise be (partially)
    # consumed by a failed `join` and the fallback would lose items.
    items = list(iterable)
    try:
        return separator.join(items)
    except Exception:
        # Mixed or non-string items; fall back to piecewise concatenation.
        pass
    if not items:
        return type(separator)()
    encoding = force_encoding(encoding)
    res = items[0]
    for item in items[1:]:
        for tail in (separator, item):
            try:
                res += tail
            except Exception:
                # Decode the piece that failed (`tail`), which may be the
                # separator and not only the item.
                res = (safe_decode(res, encoding) +
                       safe_decode(tail, encoding))
    return res
# Makes explicit the deprecation warning for py3k. if _py3: safe_join = _deprecated('builtin join method of str', 'safe_join is deprecated for Python 3. Use ' 'builtin join method of str.')(safe_join)
def safe_strip(value):
    '''Strip leading and trailing space-chars from `value` if it is a string.

    Any non-string `value` is returned unchanged.

    .. versionadded:: 1.1.3

    '''
    from xoutil.eight import string_types
    if isinstance(value, string_types):
        return value.strip()
    return value
def cut_prefix(value, prefix):
    '''Return `value` without the leading `prefix`.

    If `value` does not start with `prefix`, it is returned unchanged.  When
    one of the arguments is text and the other is bytes, `prefix` is first
    converted to the kind of string `value` is.

    '''
    from xoutil.eight import text_type, binary_type
    if isinstance(prefix, binary_type) and isinstance(value, text_type):
        prefix = safe_decode(prefix)
    elif isinstance(prefix, text_type) and isinstance(value, binary_type):
        prefix = safe_encode(prefix)
    if not value.startswith(prefix):
        return value
    return value[len(prefix):]
def cut_any_prefix(value, *prefixes):
    '''Apply `cut_prefix`:func: for the first matching prefix.

    The prefixes are tried in order; the result of the first one that
    actually changes `value` is returned.  If none does, the result is
    `value` itself.

    '''
    for candidate in prefixes:
        trimmed = cut_prefix(value, candidate)
        if trimmed != value:
            return trimmed
    return value
def cut_prefixes(value, *prefixes):
    '''Apply `cut_prefix`:func: for all provided prefixes in order.

    Each prefix is removed from the result of removing the previous ones.

    '''
    for current in prefixes:
        value = cut_prefix(value, current)
    return value
def cut_suffix(value, suffix):
    '''Return `value` without the trailing `suffix`.

    If `value` does not end with `suffix`, it is returned unchanged.  When
    one of the arguments is text and the other is bytes, `suffix` is first
    converted to the kind of string `value` is.

    '''
    from xoutil.eight import text_type, binary_type
    if isinstance(suffix, binary_type) and isinstance(value, text_type):
        suffix = safe_decode(suffix)
    elif isinstance(suffix, text_type) and isinstance(value, binary_type):
        suffix = safe_encode(suffix)
    # An empty suffix must be handled apart: ``value.endswith('')`` is
    # always true, but ``value[:-0]`` is ``value[:0]``, i.e. an empty string.
    size = len(suffix)
    if size == 0 or not value.endswith(suffix):
        return value
    return value[:-size]
def cut_any_suffix(value, *suffixes):
    '''Apply `cut_suffix`:func: for the first matching suffix.

    The suffixes are tried in order; the result of the first one that
    actually changes `value` is returned.  If none does, the result is
    `value` itself.

    '''
    for candidate in suffixes:
        trimmed = cut_suffix(value, candidate)
        if trimmed != value:
            return trimmed
    return value
def cut_suffixes(value, *suffixes):
    '''Apply `cut_suffix`:func: for all provided suffixes in order.

    Each suffix is removed from the result of removing the previous ones.

    '''
    for current in suffixes:
        value = cut_suffix(value, current)
    return value
def capitalize_word(value):
    '''Capitalizes the first char of `value`.

    Only the first character is changed, and only if it is lower-case; the
    rest of `value` is left as it is.  Falsy values (empty strings, None)
    are returned unchanged.

    Works for both text and bytes and returns the same type it gets.

    '''
    if not value:
        return value
    # Slice instead of indexing: in Python 3 indexing a `bytes` object gives
    # an `int`, which has neither `islower` nor `upper`.
    head = value[:1]
    if head.islower():
        return head.upper() + value[1:]
    return value
def capitalize(value, title=True):
    '''Capitalizes value according to whether it should be title-like.

    Title-like means it will capitalize every word but the 3-letters or less
    unless its the first word::

        >>> capitalize('a group is its own worst enemy')
        'A Group is its own Worst Enemy'

    (This may be odd because, in the example above, own should be
    capitalized.)

    When `title` is false only the first word is capitalized.  The words of
    the result are separated by single spaces; an empty `value`, or one
    without words, yields an empty string.

    Return bytes or unicode depending on type of `value`.

        >>> from xoutil.eight import text_type
        >>> type(capitalize(text_type('something'))) is text_type
        True

        >>> type(capitalize(str('something'))) is str
        True

    '''
    kind = type(value)
    # Build the separator and the empty result with the type of `value`.
    blank, nothing = kind(' '), kind('')
    if not value:
        return nothing
    words = value.split()
    if not words:
        return nothing
    limit = len(words) if title else 1
    for index, word in enumerate(words[:limit]):
        if index == 0 or len(word) > 3:
            words[index] = capitalize_word(word)
    return blank.join(words)
def hyphen_name(name):
    '''Convert a name, normally an identifier, to a hyphened slug.

    All transitions from lower to upper capitals (or from digits to letters)
    are joined with a hyphen.

    Also, all invalid characters (those invalid in Python identifiers) are
    converted to hyphens.

    For example::

        >>> hyphen_name('BaseNode') == 'base-node'
        True

    '''
    import re
    boundary = re.compile('([a-z0-9][A-Z]|[a-zA-Z][0-9]|[0-9][a-z])')
    # Every match covers the two characters around a transition, so the cut
    # goes right after the first character of the match.
    cuts = [found.start() + 1 for found in boundary.finditer(name)]
    edges = [0] + cuts + [None]
    pieces = [name[lo:hi] for lo, hi in zip(edges, edges[1:])]
    return safe_str(normalize_slug('-'.join(pieces), '-', '_'))
# TODO: Document and fix all these "normalize_..." functions
def normalize_unicode(value):
    '''Return a text representation of `value`.

    None and the empty native string yield an empty string; the booleans
    True and False yield the (Spanish) words for "yes" and "no"; any other
    value is converted with `safe_decode`:func:.

    '''
    # FIXME: i18n
    # The empty string is detected by type and value; an identity test
    # against a literal only works when the interpreter interns it.
    if value is None or (isinstance(value, str) and not value):
        return ''
    elif value is True:
        return safe_decode('Sí')
    elif value is False:
        return safe_decode('No')
    else:
        return safe_decode(value)
def normalize_name(value):
    '''Return `value` converted to text and capitalized.

    The value is converted with `normalize_unicode`:func: and the result is
    passed to `capitalize`:func: using its default mode.

    '''
    text = normalize_unicode(value)
    return capitalize(text)
def normalize_title(value):
    '''Return `value` converted to text and capitalized as a title.

    The value is converted with `normalize_unicode`:func: and the result is
    passed to `capitalize`:func: asking explicitly for the title mode.

    '''
    text = normalize_unicode(value)
    return capitalize(text, True)
def normalize_str(value):
    '''Normalize the words of `value`.

    Words (runs of non-space characters) of three or more characters are
    capitalized and shorter ones are lower-cased.  The resulting words are
    joined with single spaces.

    Works for both text and bytes and returns the same kind of string it
    gets.

    '''
    import re
    pattern, sep = r'(\S+)\s*', ' '
    if isinstance(value, bytes):
        # In Python 3 ``bytes('...')`` requires an encoding; both literals
        # are pure ASCII, so encoding them explicitly works in Python 2 too.
        pattern, sep = pattern.encode('ascii'), sep.encode('ascii')
    words = re.compile(pattern).findall(value)
    return sep.join(word.capitalize() if len(word) >= 3 else word.lower()
                    for word in words)
def normalize_ascii(value):
    '''Return the ASCII normal form for the `value`.

    The value is converted to text if needed, decomposed using the unicode
    'NFKD' normalization and then encoded to ASCII, ignoring every character
    that cannot be encoded.  The result is returned as a native `str`.

    '''
    import unicodedata
    from xoutil.eight import text_type
    text = value if isinstance(value, text_type) else safe_decode(value)
    decomposed = unicodedata.normalize('NFKD', text)
    return safe_str(decomposed.encode('ascii', 'ignore'))
def normalize_slug(value, replacement='-', invalids=None, valids=None):
    '''Return the string normal form, valid for slugs, for the `value`.

    Convert all non-ascii to valid characters (see `normalize_ascii`:func:,
    which uses the unicode 'NFKD' normalization).  Lower-case the result.
    Replace unwanted characters by `replacement`; a run of unwanted
    characters is converted to only one instance, and leading and trailing
    instances are removed.

    `replacement` is inserted literally: it is not interpreted as a regular
    expression template, so it may contain backslashes.

    ``[_a-z0-9]`` are assumed as valid characters.  Extra arguments can
    modify this standard behaviour:

    :param invalids: Any collection of characters added to these that are
           normally invalid in the provided `value` (non-ascii or not
           included in valid characters).  Boolean ``True`` can be passed as
           a synonymous of ``"_"`` for compatibility with old
           ``invalid_underscore`` argument.  ``False`` or ``None`` are
           assumed as an empty set for invalid characters.

    :param valids: A collection of extra valid characters (all non-ascii
           characters are ignored).  This parameter could be either a valid
           string, any iterator of valid strings of characters, or ``None``
           to use only default valid characters (See above).

    :raises TypeError: if `replacement` is neither a string, ``None`` nor
            ``False``.

    .. warning:: The result may contain characters in `invalids` if
                 `replacements` does.

    Parameter `value` could be of any (non-string) type, it is normalized
    and converted to a lower-case ASCII string.

    Examples::

      >>> normalize_slug('  Á.e i  Ó  u  ') == 'a-e-i-o-u'
      True

      >>> normalize_slug('  Á.e i  Ó  u  ', '.', invalids='AU') == 'e.i.o'
      True

      >>> normalize_slug('  Á.e i  Ó  u  ', valids='.') == 'a.e-i-o-u'
      True

      >>> normalize_slug('_x', '_') == '_x'
      True

      >>> normalize_slug('-x', '_') == 'x'
      True

      >>> normalize_slug(None) == 'none'
      True

      >>> normalize_slug(1 == 1)  == 'true'
      True

      >>> normalize_slug(1.0) == '1-0'
      True

      >>> normalize_slug(135) == '135'
      True

      >>> normalize_slug(123456, '', invalids='52') == '1346'
      True

    .. versionchanged:: 1.5.5 Added the `invalid_underscore` parameter.

    .. versionchanged:: 1.6.6 Replaced the `invalid_underscore` paremeter by
       `invalids`.  Added the `valids` parameter.

    .. versionchanged:: 1.7.2 Clarified the role of `invalids` with regards
       to `replacement`.

    '''
    import re
    from xoutil.eight import string_types

    def _normalize(v):
        # Lower-case ASCII form of any value.
        return normalize_ascii(v).lower()

    def _char_class(v):
        # Unique characters of `v`, escaped to be used inside ``[...]``.
        return re.escape(''.join(sorted(set(v))))

    # check and adjust arguments
    if replacement in (None, False):
        replacement = ''
    elif isinstance(replacement, string_types):
        replacement = normalize_ascii(replacement)  # TODO: or _normalize?
    else:
        msg = '`replacement` (%s) must be a string or None, not `%s`.'
        raise TypeError(msg % (replacement, type(replacement)))
    if invalids is True:
        # Backward compatibility with former `invalid_underscore` argument
        invalids = '_'
    elif invalids in (None, False):
        # A tuple (not a set) is used for this test because `invalids` may
        # be an unhashable collection, e.g. a list.
        invalids = ''
    else:
        if not isinstance(invalids, string_types):
            invalids = ''.join(invalids)
        invalids = _char_class(_normalize(invalids))
    if valids is None:
        valids = ''
    else:
        if not isinstance(valids, string_types):
            valids = ''.join(valids)
        # Characters that are valid by default need not be added again.
        valids = _char_class(re.sub(r'[0-9a-z]+', '', _normalize(valids)))
    # calculate result
    res = _normalize(value)
    # A tab is used as a temporary marker; it can not be confused with any
    # character of `replacement` while runs and edges are being cleaned.
    mark = '\t' if replacement else ''
    res = re.sub(r'[^_a-z0-9%s]+' % valids, mark, res)
    if invalids:
        res = re.sub(r'[%s]+' % invalids, mark, res)
    if mark:
        res = re.sub(r'\t{2,}', mark, res)
        res = res.strip(mark)
        # Plain string replacement: `re.sub` would interpret backslashes
        # contained in `replacement`.
        res = res.replace(mark, replacement)
    return res
def strfnumber(number, format_spec='%0.2f'):
    '''Format `number` with `format_spec` dropping the useless zeros.

    The number is formatted using the ``%`` operator.  If the result has a
    dot, all trailing zeros are removed, and so is the dot when it becomes
    the last character.

    '''
    text = format_spec % number
    if '.' not in text:
        return text
    text = text.rstrip('0')
    return text[:-1] if text.endswith('.') else text
def parse_boolean(value):
    '''Parse a boolean from any value given a special treatment to strings.

    Strings are stripped first.  An empty string is false; a string of
    digits is true unless its integer value is zero; any other string is
    true unless it is, ignoring case, one of "false", "no" or "not".

    Values that are not strings are converted with `bool`.

    >>> parse_boolean('trUe')
    True

    >>> parse_boolean('faLSe')
    False

    '''
    from xoutil.eight import string_types
    if not isinstance(value, string_types):
        return bool(value)
    text = value.strip()
    if not text:
        return False
    if text.isdigit():
        return bool(int(text))
    if isinstance(text, bytes):
        falses = (b'false', b'no', b'not')
    else:
        falses = ('false', 'no', 'not')
    return text.lower() not in falses
def parse_url_int(value, default=None):
    '''Parse an integer URL argument.

    Some operations treat simple arguments as a list of one element, so for
    a non-empty list, tuple or set the first item is the one parsed.

    :param value: The value to parse; strings are stripped before being
           converted.

    :param default: The value to return when `value` cannot be converted to
           an integer.

    '''
    # TODO: Move to `xoutil.web`
    if isinstance(value, (list, tuple, set)) and len(value) > 0:
        # Sets do not support indexing; iteration works for the three types.
        value = next(iter(value))
    try:
        return int(safe_strip(value))
    except Exception:
        # Best effort: anything that is not an integer yields `default`.
        return default
def error2str(error):
    '''Convert an error to string.

    - A string is returned as a native `str`.

    - An exception instance yields its message, prefixed with the name of
      its type (``"Name: message"``) unless the message already contains
      that name.  If there is no message only the name is returned.

    - An exception class yields its name.

    - Anything else yields a description prefixed with "unknown error: ".

    '''
    from xoutil.eight import string_types
    from xoutil.types import type_coerce
    if isinstance(error, string_types):
        return safe_str(error)
    elif isinstance(error, BaseException):
        tname = type(error).__name__
        res = safe_str(error)
        if tname in res:
            return res
        else:
            # `join` takes only one argument: the sequence of pieces.
            return str(': ').join((tname, res)) if res else tname
    elif isinstance(error, type) and issubclass(error, BaseException):
        # `error` is the class itself; `type(error)` would be its metaclass.
        # The `isinstance` guard is needed because `issubclass` raises
        # TypeError for anything that is not a class.
        return error.__name__
    else:
        prefix = str('unknown error: ')
        # NOTE(review): presumably `type_coerce` returns `error` itself when
        # it is a type and its type otherwise -- confirm in `xoutil.types`.
        cls = type_coerce(error)
        tname = cls.__name__
        if cls is error:
            res = tname
        else:
            res = safe_str(error)
            if tname not in res:
                res = str('{}({})').format(tname, res) if res else tname
        return prefix + res
def force_str(value, encoding=None):
    '''Force to string, the type is different in Python 2 or 3 (bytes or
    unicode).

    :param value: The value to convert to `str`.  A value that already is a
           `str` is returned unchanged.

    :param encoding: The encoding which should be used if either encoding or
           decoding should be performed on `value`.  The default is to use
           the same default as :func:`safe_encode` or :func:`safe_decode`.

    .. versionadded:: 1.2.0

    '''
    if isinstance(value, str):
        return value
    # ``str is bytes`` only holds in Python 2, where text must be encoded.
    convert = safe_encode if str is bytes else safe_decode
    return convert(value, encoding)
def make_a10z(string):
    '''Utility to find out that "internationalization" is "i18n".

    The result is made of the first character of `string`, the count of
    characters between the first and the last ones, and the last character.

    Examples::

       >>> print(make_a10z('parametrization'))
       p13n

    '''
    first = string[0]
    hidden = string[1:-1]
    last = string[-1]
    return first + str(len(hidden)) + last
from xoutil.eight import input
# Re-export `input` from this module wrapped by `_deprecated`; the message
# points users to `xoutil.eight.input`.
input = _deprecated(
    input,
    "xoutil.string.input is deprecated. Use xoutil.eight.input"
)(input)