""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import builtins
import sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
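
# Illustrative note (added commentary, not part of the original module): the
# BOM constants above are plain bytes objects, so callers can detect and strip
# a signature manually when a codec does not do it for them.  A minimal sketch:
#
#   >>> data = BOM_UTF8 + 'text'.encode('utf-8')
#   >>> data.startswith(BOM_UTF8)
#   True
#   >>> data[len(BOM_UTF8):].decode('utf-8')
#   'text'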


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))
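
# Illustrative example (added commentary, not part of the original module):
# lookup() returns a CodecInfo, which behaves both as a 4-tuple and as an
# object with named attributes.  'utf-8' is used here purely as an example.
#
#   >>> info = lookup('utf-8')
#   >>> info.name
#   'utf-8'
#   >>> info.decode(b'abc')       # the same callable as info[1]
#   ('abc', 3)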

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences.
         'namereplace' - Replace with \\N{...} escape sequences
                         (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be a bytes-like object, i.e. one that exposes the
            buffer protocol. Bytes objects, bytearrays and memory-mapped
            files are examples of objects providing this protocol.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
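
# Illustrative sketch (added commentary, not part of the original module):
# a minimal stateless codec that simply delegates to the UTF-8 helpers
# exported by _codecs, showing the (output, length consumed) contract the
# Codec interface expects.  The class name is hypothetical.
#
#   class _Utf8PassThrough(Codec):
#       def encode(self, input, errors='strict'):
#           return utf_8_encode(input, errors)        # -> (bytes, consumed)
#       def decode(self, input, errors='strict'):
#           return utf_8_decode(input, errors, True)  # -> (str, consumed)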

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Override this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Override this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]
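
# Illustrative example (added commentary, not part of the original module):
# decoding a byte stream that arrives in arbitrary chunks with the
# incremental API above; getincrementaldecoder() is defined further below
# and 'utf-8' is chosen purely as an example codec.
#
#   >>> dec = getincrementaldecoder('utf-8')()
#   >>> dec.decode(b'\xc3')              # first byte of a two-byte sequence
#   ''
#   >>> dec.decode(b'\xa9', final=True)  # completes U+00E9
#   'é'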

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences.
             'namereplace' - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
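
# Illustrative example (added commentary, not part of the original module):
# wrapping a binary stream with a StreamWriter obtained from the codec
# registry.  getwriter() is defined further below; 'utf-8' and io.BytesIO
# are just convenient stand-ins for a real codec and stream.
#
#   >>> import io
#   >>> raw = io.BytesIO()
#   >>> writer = getwriter('utf-8')(raw)
#   >>> writer.write('hällo')
#   >>> raw.getvalue()
#   b'h\xc3\xa4llo'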

###

class StreamReader(Codec):

    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to find the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
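
# Illustrative example (added commentary, not part of the original module):
# reading decoded text from a binary stream through a StreamReader.
# getreader() is defined further below; io.BytesIO and 'utf-8' are only
# stand-ins chosen for the sketch.
#
#   >>> import io
#   >>> raw = io.BytesIO('line one\nline two\n'.encode('utf-8'))
#   >>> reader = getreader('utf-8')(raw)
#   >>> reader.readline()
#   'line one\n'
#   >>> reader.read()
#   'line two\n'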

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interfaces,
            respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces, respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = b''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        file.close()
        raise
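
# Illustrative example (added commentary, not part of the original module):
# round-tripping text through the wrapper returned by open().  The file
# name and the choice of 'utf-8' are hypothetical.
#
#   >>> with open('example.txt', 'w', encoding='utf-8') as f:
#   ...     f.write('grüße\n')
#   >>> with open('example.txt', encoding='utf-8') as f:
#   ...     f.read()
#   'grüße\n'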

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
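
# Illustrative example (added commentary, not part of the original module):
# transcoding writes from utf-8 on the caller side to latin-1 on the file
# side; io.BytesIO stands in for a real binary file.
#
#   >>> import io
#   >>> raw = io.BytesIO()
#   >>> ef = EncodedFile(raw, data_encoding='utf-8', file_encoding='latin-1')
#   >>> ef.write('café'.encode('utf-8'))
#   >>> raw.getvalue()
#   b'caf\xe9'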

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode(b"", True)
    if output:
        yield output
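
# Illustrative example (added commentary, not part of the original module):
# a chunked encode/decode round trip through the two generators above,
# using 'utf-8' purely as an example codec.
#
#   >>> list(iterdecode(iterencode(['ab', 'cd'], 'utf-8'), 'utf-8'))
#   ['ab', 'cd']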

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i: i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
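
# Illustrative example (added commentary, not part of the original module):
# how the two charmap helpers behave, including the None mapping produced
# for duplicate targets.
#
#   >>> make_identity_dict(range(3))
#   {0: 0, 1: 1, 2: 2}
#   >>> make_encoding_map({0x00: 'a', 0x01: 'a'})   # duplicate target 'a'
#   {'a': None}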

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')