# Admin tools
# File being edited: newstr.py
""" This module redefines ``str`` on Python 2.x to be a subclass of the Py2 ``unicode`` type that behaves like the Python 3.x ``str``. The main differences between ``newstr`` and Python 2.x's ``unicode`` type are the stricter type-checking and absence of a `u''` prefix in the representation. It is designed to be used together with the ``unicode_literals`` import as follows: >>> from __future__ import unicode_literals >>> from builtins import str, isinstance On Python 3.x and normally on Python 2.x, these expressions hold >>> str('blah') is 'blah' True >>> isinstance('blah', str) True However, on Python 2.x, with this import: >>> from __future__ import unicode_literals the same expressions are False: >>> str('blah') is 'blah' False >>> isinstance('blah', str) False This module is designed to be imported together with ``unicode_literals`` on Python 2 to bring the meaning of ``str`` back into alignment with unprefixed string literals (i.e. ``unicode`` subclasses). Note that ``str()`` (and ``print()``) would then normally call the ``__unicode__`` method on objects in Python 2. To define string representations of your objects portably across Py3 and Py2, use the :func:`python_2_unicode_compatible` decorator in :mod:`future.utils`. """ from numbers import Number from future.utils import PY3, istext, with_metaclass, isnewbytes from future.types import no, issubset from future.types.newobject import newobject if PY3: # We'll probably never use newstr on Py3 anyway... 
unicode = str from collections.abc import Iterable else: from collections import Iterable class BaseNewStr(type): def __instancecheck__(cls, instance): if cls == newstr: return isinstance(instance, unicode) else: return issubclass(instance.__class__, cls) class newstr(with_metaclass(BaseNewStr, unicode)): """ A backport of the Python 3 str object to Py2 """ no_convert_msg = "Can't convert '{0}' object to str implicitly" def __new__(cls, *args, **kwargs): """ From the Py3 str docstring: str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'. """ if len(args) == 0: return super(newstr, cls).__new__(cls) # Special case: If someone requests str(str(u'abc')), return the same # object (same id) for consistency with Py3.3. This is not true for # other objects like list or dict. elif type(args[0]) == newstr and cls == newstr: return args[0] elif isinstance(args[0], unicode): value = args[0] elif isinstance(args[0], bytes): # i.e. Py2 bytes or newbytes if 'encoding' in kwargs or len(args) > 1: value = args[0].decode(*args[1:], **kwargs) else: value = args[0].__str__() else: value = args[0] return super(newstr, cls).__new__(cls, value) def __repr__(self): """ Without the u prefix """ value = super(newstr, self).__repr__() # assert value[0] == u'u' return value[1:] def __getitem__(self, y): """ Warning: Python <= 2.7.6 has a bug that causes this method never to be called when y is a slice object. Therefore the type of newstr()[:2] is wrong (unicode instead of newstr). 
""" return newstr(super(newstr, self).__getitem__(y)) def __contains__(self, key): errmsg = "'in <string>' requires string as left operand, not {0}" # Don't use isinstance() here because we only want to catch # newstr, not Python 2 unicode: if type(key) == newstr: newkey = key elif isinstance(key, unicode) or isinstance(key, bytes) and not isnewbytes(key): newkey = newstr(key) else: raise TypeError(errmsg.format(type(key))) return issubset(list(newkey), list(self)) @no('newbytes') def __add__(self, other): return newstr(super(newstr, self).__add__(other)) @no('newbytes') def __radd__(self, left): " left + self " try: return newstr(left) + self except: return NotImplemented def __mul__(self, other): return newstr(super(newstr, self).__mul__(other)) def __rmul__(self, other): return newstr(super(newstr, self).__rmul__(other)) def join(self, iterable): errmsg = 'sequence item {0}: expected unicode string, found bytes' for i, item in enumerate(iterable): # Here we use type() rather than isinstance() because # __instancecheck__ is being overridden. E.g. # isinstance(b'abc', newbytes) is True on Py2. if isnewbytes(item): raise TypeError(errmsg.format(i)) # Support use as a staticmethod: str.join('-', ['a', 'b']) if type(self) == newstr: return newstr(super(newstr, self).join(iterable)) else: return newstr(super(newstr, newstr(self)).join(iterable)) @no('newbytes') def find(self, sub, *args): return super(newstr, self).find(sub, *args) @no('newbytes') def rfind(self, sub, *args): return super(newstr, self).rfind(sub, *args) @no('newbytes', (1, 2)) def replace(self, old, new, *args): return newstr(super(newstr, self).replace(old, new, *args)) def decode(self, *args): raise AttributeError("decode method has been disabled in newstr") def encode(self, encoding='utf-8', errors='strict'): """ Returns bytes Encode S using the codec registered for encoding. Default encoding is 'utf-8'. errors may be given to set a different error handling scheme. 
Default is 'strict' meaning that encoding errors raise a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name registered with codecs.register_error that can handle UnicodeEncodeErrors. """ from future.types.newbytes import newbytes # Py2 unicode.encode() takes encoding and errors as optional parameter, # not keyword arguments as in Python 3 str. # For the surrogateescape error handling mechanism, the # codecs.register_error() function seems to be inadequate for an # implementation of it when encoding. (Decoding seems fine, however.) # For example, in the case of # u'\udcc3'.encode('ascii', 'surrogateescape_handler') # after registering the ``surrogateescape_handler`` function in # future.utils.surrogateescape, both Python 2.x and 3.x raise an # exception anyway after the function is called because the unicode # string it has to return isn't encodable strictly as ASCII. if errors == 'surrogateescape': if encoding == 'utf-16': # Known to fail here. See test_encoding_works_normally() raise NotImplementedError('FIXME: surrogateescape handling is ' 'not yet implemented properly') # Encode char by char, building up list of byte-strings mybytes = [] for c in self: code = ord(c) if 0xD800 <= code <= 0xDCFF: mybytes.append(newbytes([code - 0xDC00])) else: mybytes.append(c.encode(encoding=encoding)) return newbytes(b'').join(mybytes) return newbytes(super(newstr, self).encode(encoding, errors)) @no('newbytes', 1) def startswith(self, prefix, *args): if isinstance(prefix, Iterable): for thing in prefix: if isnewbytes(thing): raise TypeError(self.no_convert_msg.format(type(thing))) return super(newstr, self).startswith(prefix, *args) @no('newbytes', 1) def endswith(self, prefix, *args): # Note we need the decorator above as well as the isnewbytes() # check because prefix can be either a bytes object or e.g. a # tuple of possible prefixes. (If it's a bytes object, each item # in it is an int.) 
if isinstance(prefix, Iterable): for thing in prefix: if isnewbytes(thing): raise TypeError(self.no_convert_msg.format(type(thing))) return super(newstr, self).endswith(prefix, *args) @no('newbytes', 1) def split(self, sep=None, maxsplit=-1): # Py2 unicode.split() takes maxsplit as an optional parameter, # not as a keyword argument as in Python 3 str. parts = super(newstr, self).split(sep, maxsplit) return [newstr(part) for part in parts] @no('newbytes', 1) def rsplit(self, sep=None, maxsplit=-1): # Py2 unicode.rsplit() takes maxsplit as an optional parameter, # not as a keyword argument as in Python 3 str. parts = super(newstr, self).rsplit(sep, maxsplit) return [newstr(part) for part in parts] @no('newbytes', 1) def partition(self, sep): parts = super(newstr, self).partition(sep) return tuple(newstr(part) for part in parts) @no('newbytes', 1) def rpartition(self, sep): parts = super(newstr, self).rpartition(sep) return tuple(newstr(part) for part in parts) @no('newbytes', 1) def index(self, sub, *args): """ Like newstr.find() but raise ValueError when the substring is not found. """ pos = self.find(sub, *args) if pos == -1: raise ValueError('substring not found') return pos def splitlines(self, keepends=False): """ S.splitlines(keepends=False) -> list of strings Return a list of the lines in S, breaking at line boundaries. Line breaks are not included in the resulting list unless keepends is given and true. """ # Py2 unicode.splitlines() takes keepends as an optional parameter, # not as a keyword argument as in Python 3 str. 
parts = super(newstr, self).splitlines(keepends) return [newstr(part) for part in parts] def __eq__(self, other): if (isinstance(other, unicode) or isinstance(other, bytes) and not isnewbytes(other)): return super(newstr, self).__eq__(other) else: return NotImplemented def __hash__(self): if (isinstance(self, unicode) or isinstance(self, bytes) and not isnewbytes(self)): return super(newstr, self).__hash__() else: raise NotImplementedError() def __ne__(self, other): if (isinstance(other, unicode) or isinstance(other, bytes) and not isnewbytes(other)): return super(newstr, self).__ne__(other) else: return True unorderable_err = 'unorderable types: str() and {0}' def __lt__(self, other): if (isinstance(other, unicode) or isinstance(other, bytes) and not isnewbytes(other)): return super(newstr, self).__lt__(other) raise TypeError(self.unorderable_err.format(type(other))) def __le__(self, other): if (isinstance(other, unicode) or isinstance(other, bytes) and not isnewbytes(other)): return super(newstr, self).__le__(other) raise TypeError(self.unorderable_err.format(type(other))) def __gt__(self, other): if (isinstance(other, unicode) or isinstance(other, bytes) and not isnewbytes(other)): return super(newstr, self).__gt__(other) raise TypeError(self.unorderable_err.format(type(other))) def __ge__(self, other): if (isinstance(other, unicode) or isinstance(other, bytes) and not isnewbytes(other)): return super(newstr, self).__ge__(other) raise TypeError(self.unorderable_err.format(type(other))) def __getattribute__(self, name): """ A trick to cause the ``hasattr`` builtin-fn to return False for the 'decode' method on Py2. """ if name in ['decode', u'decode']: raise AttributeError("decode method has been disabled in newstr") return super(newstr, self).__getattribute__(name) def __native__(self): """ A hook for the future.utils.native() function. 
""" return unicode(self) @staticmethod def maketrans(x, y=None, z=None): """ Return a translation table usable for str.translate(). If there is only one argument, it must be a dictionary mapping Unicode ordinals (integers) or characters to Unicode ordinals, strings or None. Character keys will be then converted to ordinals. If there are two arguments, they must be strings of equal length, and in the resulting dictionary, each character in x will be mapped to the character at the same position in y. If there is a third argument, it must be a string, whose characters will be mapped to None in the result. """ if y is None: assert z is None if not isinstance(x, dict): raise TypeError('if you give only one argument to maketrans it must be a dict') result = {} for (key, value) in x.items(): if len(key) > 1: raise ValueError('keys in translate table must be strings or integers') result[ord(key)] = value else: if not isinstance(x, unicode) and isinstance(y, unicode): raise TypeError('x and y must be unicode strings') if not len(x) == len(y): raise ValueError('the first two maketrans arguments must have equal length') result = {} for (xi, yi) in zip(x, y): if len(xi) > 1: raise ValueError('keys in translate table must be strings or integers') result[ord(xi)] = ord(yi) if z is not None: for char in z: result[ord(char)] = None return result def translate(self, table): """ S.translate(table) -> str Return a copy of the string S, where all characters have been mapped through the given translation table, which must be a mapping of Unicode ordinals to Unicode ordinals, strings, or None. Unmapped characters are left untouched. Characters mapped to None are deleted. 
""" l = [] for c in self: if ord(c) in table: val = table[ord(c)] if val is None: continue elif isinstance(val, unicode): l.append(val) else: l.append(chr(val)) else: l.append(c) return ''.join(l) def isprintable(self): raise NotImplementedError('fixme') def isidentifier(self): raise NotImplementedError('fixme') def format_map(self): raise NotImplementedError('fixme') __all__ = ['newstr']