# Source code for xotl.tools.string

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------
# Copyright (c) Merchise Autrement [~º/~] and Contributors
# All rights reserved.
#
# This is free software; you can do what the LICENCE file allows you to.
#
"""Some additions for `string` standard module.

In Python 3 `str` is always unicode, and the `unicode` and `basestring` types
do not exist.  The `bytes` type can be used as an array whose items are one
byte each.

"""
from typing import Any, Optional, Pattern

from xotl.tools.deprecation import deprecated  # noqa
from xotl.tools.deprecation import import_deprecated  # noqa


# Names handed to `import_deprecated` below, together with the
# `xotl.tools.future.codecs` module name.
_MIGRATED_TO_CODECS = ("force_encoding", "safe_decode", "safe_encode")

# NOTE(review): presumably this keeps the names above importable from this
# module while flagging them as deprecated in favor of
# `xotl.tools.future.codecs` -- confirm against `import_deprecated`.
import_deprecated("xotl.tools.future.codecs", *_MIGRATED_TO_CODECS)


# NOTE(review): `deprecated` is applied bare here but is called with a
# replacement in `@deprecated(slugify)` further down -- confirm that the
# decorator supports both forms.
@deprecated
def safe_strip(value):
    """Strip leading and trailing whitespace from `value` if it is a string.

    Any value that is not a `str` is handed back untouched.

    """
    if not isinstance(value, str):
        return value
    return value.strip()


# TODO: Functions starting with 'cut_' must be reviewed, maybe migrated to
# some module dedicated to "string trimming".
try:
    cut_prefix = str.removeprefix
except AttributeError:
    # `str.removeprefix` is not available (it exists in Python 3.9+), so a
    # fallback with the same calling convention is provided.

    def cut_prefix(self: str, prefix: str) -> str:
        """Return `self` without the leading `prefix`.

        If `self` does not start with `prefix`, it is returned unchanged.  In
        Python 3.9+ this name is bound directly to `str.removeprefix`:meth:.

        This fallback also accepts `bytes`: when `self` and `prefix` are a mix
        of `str` and `bytes`, `prefix` is converted to the type of `self` with
        `~xotl.tools.future.codecs.safe_decode`:func: or
        `~xotl.tools.future.codecs.safe_encode`:func: before comparing.

        :param self: The string to trim.
        :param prefix: The leading fragment to remove.
        :returns: The trimmed string, or `self` when there is no match.

        """
        from xotl.tools.future.codecs import safe_decode, safe_encode

        # Make `prefix` the same kind of string as `self`.
        if isinstance(self, str):
            if isinstance(prefix, bytes):
                prefix = safe_decode(prefix)
        elif isinstance(self, bytes) and isinstance(prefix, str):
            prefix = safe_encode(prefix)
        if self.startswith(prefix):
            return self[len(prefix) :]
        return self
def cut_any_prefix(value: str, *prefixes: str) -> str:
    """Remove from `value` the first of `prefixes` that actually trims it.

    The prefixes are tried in order with `cut_prefix`:func:.  The result of
    the first one that changes `value` is returned; later prefixes are not
    tried.  If no prefix changes `value` (or none is given), `value` is
    returned unchanged.

    :param value: The string to trim.
    :param prefixes: Candidate prefixes, in priority order.
    :returns: `value` with at most one prefix removed.

    """
    for prefix in prefixes:
        trimmed = cut_prefix(value, prefix)
        if trimmed != value:
            return trimmed
    return value
def cut_prefixes(value: str, *prefixes: str) -> str:
    """Strip each of `prefixes` from `value`, one after another.

    Every prefix is passed to `cut_prefix`:func: in the given order, always
    working on the outcome of the previous step.

    """
    for prefix in prefixes:
        value = cut_prefix(value, prefix)
    return value
try:
    cut_suffix = str.removesuffix
except AttributeError:
    # `str.removesuffix` is not available (it exists in Python 3.9+), so a
    # fallback with the same calling convention is provided.

    def cut_suffix(self: str, suffix: str) -> str:
        """Return `self` without the trailing `suffix`.

        If `self` does not end with `suffix`, it is returned unchanged.  In
        Python 3.9+ this name is bound directly to `str.removesuffix`:meth:.

        This fallback also accepts `bytes`: when `self` and `suffix` are a mix
        of `str` and `bytes`, `suffix` is converted to the type of `self` with
        `~xotl.tools.future.codecs.safe_decode`:func: or
        `~xotl.tools.future.codecs.safe_encode`:func: before comparing.

        :param self: The string to trim.
        :param suffix: The trailing fragment to remove.
        :returns: The trimmed string, or `self` when there is no match.

        """
        from xotl.tools.future.codecs import safe_decode, safe_encode

        # Make `suffix` the same kind of string as `self`.
        if isinstance(self, str):
            if isinstance(suffix, bytes):
                suffix = safe_decode(suffix)
        elif isinstance(self, bytes) and isinstance(suffix, str):
            suffix = safe_encode(suffix)
        # `self.endswith('')` is always true, but `self[:-0]` is really
        # `self[:0]`, which is always empty; so an empty suffix has to be
        # ruled out explicitly.
        if suffix and self.endswith(suffix):
            return self[: -len(suffix)]
        return self
def cut_any_suffix(value: str, *suffixes: str) -> str:
    """Remove from `value` the first of `suffixes` that actually trims it.

    The suffixes are tried in order with `cut_suffix`:func:.  The result of
    the first one that changes `value` is returned; later suffixes are not
    tried.  If no suffix changes `value` (or none is given), `value` is
    returned unchanged.

    :param value: The string to trim.
    :param suffixes: Candidate suffixes, in priority order.
    :returns: `value` with at most one suffix removed.

    """
    for suffix in suffixes:
        trimmed = cut_suffix(value, suffix)
        if trimmed != value:
            return trimmed
    return value
def cut_suffixes(value: str, *suffixes: str) -> str:
    """Strip each of `suffixes` from `value`, one after another.

    Every suffix is passed to `cut_suffix`:func: in the given order, always
    working on the outcome of the previous step.

    """
    for suffix in suffixes:
        value = cut_suffix(value, suffix)
    return value
def force_ascii(value: Any, encoding: Optional[str] = None) -> str:
    """Return the ASCII-only normal form of `value`.

    The text is decomposed using the unicode 'NFKD' normalization and every
    character that still cannot be encoded as ASCII is dropped; for instance
    ``'Á'`` becomes ``'A'``.

    :param value: The value to convert.  If it is not a `str`, it is decoded
           first with `~xotl.tools.future.codecs.safe_decode`:func:.

    :param encoding: If `value` is not unicode, it is decoded before ASCII
           normalization using this encoding.  If not provided use the return
           of `~xotl.tools.future.codecs.force_encoding`:func:.

    :returns: A `str` made only of ASCII characters.

    .. versionchanged:: 1.8.7 Add parameter 'encoding'.

    .. versionchanged:: 2.1.0 Moved to `xotl.tools.string`:mod:.

    """
    import unicodedata

    if not isinstance(value, str):
        # Imported lazily: the decoder is only needed for non-text values.
        from .future.codecs import safe_decode

        value = safe_decode(value, encoding=encoding)
    decomposed = unicodedata.normalize("NFKD", value)
    # After encoding with 'ignore' only ASCII bytes remain, so decoding
    # cannot fail.
    return decomposed.encode("ascii", "ignore").decode("ascii")
def slugify(value: Any, *args, **kwds) -> str:
    """Return the normal-form of a given string value that is valid for slugs.

    Convert all non-ascii to valid characters, whenever possible, using
    unicode 'NFKD' normalization (see `force_ascii`:func:) and lower-case the
    result.  Replace unwanted characters by the value of `replacement`
    (remove extra when repeated).

    Default valid characters are ``[_a-z0-9]``.  Extra arguments
    `invalid_chars` and `valid_chars` can modify this standard behaviour, see
    next:

    :param value: The source value to slugify.

    :param replacement: A character to be used as replacement for unwanted
           characters.  Could be either the first extra positional argument,
           or a keyword argument.  Default value is a hyphen ('-').

           There will be a contradiction if this argument contains any invalid
           character (see `invalid_chars`).  ``None``, or ``False``, will be
           converted to an empty string for backward compatibility with old
           versions of this function; do not rely on this, it will be
           deprecated.

    :param invalid_chars: Characters to be considered invalid.  There is a
           default set of valid characters which are kept in the resulting
           slug.  Characters given in this parameter are removed from the
           resulting valid character set (see `valid_chars`).

           Extra argument values can be used for compatibility with
           `invalid_underscore` argument in deprecated `normalize_slug`
           function:

           - ``True`` is a synonymous of underscore ``"_"``.

           - ``False`` or ``None``: An empty set.

           Could be given as a name argument or in the second extra positional
           argument.  Default value is an empty set.

    :param valid_chars: A collection of extra valid characters.  Could be
           either a valid string, any iterator of strings, or ``None`` to use
           only default valid characters.  Non-ASCII characters are ignored.

    :param encoding: If `value` is not a text (unicode), it is decoded before
           `ASCII normalization <force_ascii>`:func:.

    :raises TypeError: If `replacement` is neither a string, ``None`` nor
            ``False``.

    :raises ValueError: If `replacement` contains any of the invalid
            characters.

    Examples::

       >>> slugify(' Á.e i Ó u ') == 'a-e-i-o-u'
       True

       >>> slugify(' Á.e i Ó u ', '.', invalid_chars='AU') == 'e.i.o'
       True

       >>> slugify(' Á.e i Ó u ', valid_chars='.') == 'a.e-i-o-u'
       True

       >>> slugify('_x', '_') == '_x'
       True

       >>> slugify('-x', '_') == 'x'
       True

       >>> slugify(None) == 'none'
       True

       >>> slugify(1 == 1) == 'true'
       True

       >>> slugify(1.0) == '1-0'
       True

       >>> slugify(135) == '135'
       True

       >>> slugify(123456, '', invalid_chars='52') == '1346'
       True

       >>> slugify('_x', '_') == '_x'
       True

    .. versionchanged:: 1.5.5 Added the `invalid_underscore` parameter.

    .. versionchanged:: 1.6.6 Replaced the `invalid_underscore` parameter by
       `invalids`.  Added the `valids` parameter.

    .. versionchanged:: 1.7.2 Clarified the role of `invalids` with regards to
       `replacement`.

    .. versionchanged:: 1.8.0 Deprecate the `invalids` parameter name in favor
       of `invalid_chars`, also deprecate the `valids` parameter name in favor
       of `valid_chars`.

    .. versionchanged:: 1.8.7 Add parameter 'encoding'.

    .. versionchanged:: 2.1.0 Remove deprecated parameters `invalids` and
       `valids`.

    """
    import re
    from .params import ParamManager
    from .values import compose, istype
    from .values.simple import not_false, ascii_coerce

    # Coercers handed to `getarg` below for the character-set arguments.
    _str = compose(not_false(""), istype(str))
    _ascii = compose(_str, ascii_coerce)

    # local functions
    def _normalize(v):
        # `encoding` is a closure variable; it is bound further below, before
        # this helper is called for the first time.
        return force_ascii(v, encoding=encoding).lower()

    def _set(v):
        # Unique normalized characters of `v`, escaped for use in a regular
        # expression character class.
        return re.escape("".join(set(_normalize(v))))

    getarg = ParamManager(args, kwds)
    replacement = getarg("replacement", 0, default="-", coercers=(str,))
    # NOTE(review): the docstring places `invalid_chars` in the second extra
    # positional argument, but index 0 is passed here and for `valid_chars`
    # -- confirm `ParamManager` index semantics.
    invalid_chars = getarg("invalid_chars", "invalid", 0, default="", coercers=_ascii)
    valid_chars = getarg("valid_chars", "valid", 0, default="", coercers=_ascii)
    encoding = getarg("encoding", default=None)
    # NOTE(review): this overrides the `replacement` obtained through `getarg`
    # a few lines above -- confirm that is intended.
    replacement = args[0] if args else kwds.pop("replacement", "-")
    # TODO: check unnecessary arguments, raising errors
    if replacement in (None, False):
        # for backward compatibility
        replacement = ""
    elif isinstance(replacement, str):
        replacement = _normalize(replacement)
    else:
        raise TypeError(
            'slugify() replacement "{}" must be a string or None,'
            ' not "{}".'.format(replacement, type(replacement))
        )
    if invalid_chars is True:
        # Backward compatibility with former `invalid_underscore` argument
        invalid_chars = "_"
    elif invalid_chars in {None, False}:
        invalid_chars = ""
    else:
        if not isinstance(invalid_chars, str):
            invalid_chars = "".join(invalid_chars)
        invalid_chars = _set(invalid_chars)
    invalid_regex: Optional[Pattern]
    if invalid_chars:
        invalid_regex = re.compile(r"[{}]+".format(invalid_chars))
        # The replacement itself must survive the invalid-character filter.
        if invalid_regex.search(replacement):
            raise ValueError(
                'slugify() replacement "{}" must not contain '
                "any invalid character.".format(replacement)
            )
    else:
        invalid_regex = None
    if valid_chars is None:
        valid_chars = ""
    else:
        if not isinstance(valid_chars, str):
            valid_chars = "".join(valid_chars)
        valid_chars = _set(valid_chars)
        # Digits and lower-case letters are already valid by default.
        # NOTE(review): `_set` is applied a second time to an already escaped
        # string, so the escaping backslashes themselves end up in the valid
        # set -- confirm this is intended.
        valid_chars = _set(re.sub(r"[0-9a-z]+", "", valid_chars))
    # From here on `valid_chars` is a pattern matching runs of characters
    # that are NOT valid.
    valid_chars = re.compile(r"[^_0-9a-z{}]+".format(valid_chars))
    # calculate result
    # A tab stands in for the replacement while repeats are collapsed; it is
    # swapped for the real replacement at the end.
    repl = "\t" if replacement else ""
    res = valid_chars.sub(repl, _normalize(value))
    if invalid_regex:
        res = invalid_regex.sub(repl, res)
    if repl:
        # convert two or more replacements in only one instance
        r = r"{}".format(re.escape(repl))
        res = re.sub(r"({r}){{2,}}".format(r=r), repl, res)
        # remove start and end more replacement instances
        res = re.sub(r"(^{r}+|{r}+$)".format(r=r), "", res)
        res = re.sub(r"[\t]", replacement, res)
    return res
def error2str(error):
    """Convert an error to string.

    The result depends on what `error` is:

    - A `str` is returned as is.

    - An exception instance yields ``'<type name>: <message>'``.  If the
      message already contains the type name, only the message is returned;
      if the message is empty, only the type name is returned.

    - An exception class yields its name.

    - Anything else is reported with the ``'unknown error: '`` prefix,
      followed by the class name (when `error` is a class), or by
      ``'<type name>(<str(error)>)'``.  If ``str(error)`` already contains
      the type name it is used alone; if it is empty, only the type name is
      used.

    :param error: Any value describing an error.
    :returns: A `str` describing `error`.

    """
    if isinstance(error, str):
        return error
    if isinstance(error, BaseException):
        tname = type(error).__name__
        text = str(error)
        if not text:
            return tname
        return text if tname in text else "{}: {}".format(tname, text)
    if isinstance(error, type):
        if issubclass(error, BaseException):
            return error.__name__
        # A class that is not an exception: report just its name.
        return "unknown error: " + error.__name__
    # Any other object: describe it by type name and string form.
    tname = type(error).__name__
    text = str(error)
    if tname not in text:
        text = "{}({})".format(tname, text) if text else tname
    return "unknown error: " + text
def make_a10z(string: str) -> str:
    """Utility to find out that "internationalization" is "i18n".

    Keep the first and the last characters of `string` and replace everything
    in between by the count of characters removed.  Strings shorter than two
    characters have no distinct first and last character, so they are
    returned unchanged.

    :param string: The word to abbreviate.
    :returns: The abbreviated form.

    Examples::

       >>> print(make_a10z('parametrization'))
       p13n

    """
    if len(string) < 2:
        return string
    return "{}{}{}".format(string[0], len(string) - 2, string[-1])
@deprecated(slugify)
def normalize_slug(value: Any, *args, **kwds) -> str:
    """Deprecated alias of `slugify`:func:.

    All arguments are forwarded untouched and the result of `slugify` is
    returned as is.

    """
    return slugify(value, *args, **kwds)
# Remove the deprecation helpers from this module's namespace now that the
# definitions above have used them.
del deprecated, import_deprecated