"""
Python Codec for XML decoding
"""

import codecs

### Codec APIs

def _detectencoding(input, final):
	"""
	Guess the encoding of the XML document that starts with ``input``.

	``input`` is the beginning of the raw document; ``final`` is true when
	no further data will follow. Returns the encoding name, or ``None`` if
	``input`` is still too short to decide (only possible when ``final`` is
	false). Raises ``UnicodeError`` for UTF-32 input, which is not supported.
	"""
	# See http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing
	if input.startswith(("\x00\x00\xfe\xff", "\xff\xfe\x00\x00")): # UTF-32
		raise UnicodeError("can't handle UTF-32")
	elif input.startswith("\x00\x00\x00\x3c"): # UTF-32-BE
		raise UnicodeError("can't handle UTF-32-BE")
	elif input.startswith("\x3c\x00\x00\x00"): # UTF-32-LE
		raise UnicodeError("can't handle UTF-32-LE")
	elif input.startswith("\xfe\xff"): # UTF-16 (BE variant)
		return "utf-16"
	elif input.startswith("\xff\xfe"): # UTF-16 (LE variant)
		# As long as everything seen so far is a prefix of the UTF-32 BOM
		# we must wait: it might turn out to be UTF-32 later
		if not final and "\xff\xfe\x00\x00".startswith(input):
			return None
		return "utf-16"
	elif input.startswith("\x00\x3c"): # UTF-16-BE
		return "utf-16-be"
	elif input.startswith("\x3c\x00"): # UTF-16-LE
		# Same as above: it might turn out to be UTF-32-LE later
		if not final and "\x3c\x00\x00\x00".startswith(input):
			return None
		return "utf-16-le"
	elif input.startswith("\xef\xbb\xbf"): # UTF-8-SIG
		return "utf-8-sig"
	elif input.startswith("<?xml"): # We have an XML header in an ASCII compatible encoding
		# Only look inside the header itself, so that an "encoding"
		# attribute further down in the document is never picked up
		end = input.find("?>")
		if end >= 0:
			header = input[:end]
		else:
			header = input
		pos = header.find("encoding")
		if pos >= 0:
			rest = header[pos+8:].lstrip() # whitespace is allowed around "="
			if rest.startswith("="):
				rest = rest[1:].lstrip()
				quote = rest[:1] # might be empty
				if quote in ('"', "'"):
					pos2 = rest.find(quote, 1)
					if pos2 > 1: # closing quote seen and the name is not empty
						return rest[1:pos2]
		if end >= 0 or final:
			# Header is complete (or no more data will come) and it names
			# no usable encoding => use the XML default
			return "utf-8"
		return None # header is incomplete, wait for more data
	elif len(input) >= 5 or final: # doesn't start with an XML header
		return "utf-8"
	return None


def decode(input, errors='strict'):
	"""
	Stateless decoder: detect the encoding of the complete XML document
	``input`` and decode it with the codec registered for that encoding.

	Returns whatever that codec's decode function returns for
	``(input, errors)``. Raises ``UnicodeError`` if no encoding can be
	detected or if the document names this codec as its own encoding; a
	failing lookup of the detected encoding propagates from ``codecs.lookup``.
	"""
	encoding = _detectencoding(input, True)
	if encoding is None:
		raise UnicodeError("can't detect encoding")
	info = codecs.lookup(encoding)
	if info.name == "xml":
		# The document claims to be encoded in "xml": handing it over to that
		# codec would land here again and recurse without end
		raise UnicodeError("XML declaration can't name the xml codec itself")
	return info.decode(input, errors)

class IncrementalDecoder(codecs.IncrementalDecoder):
	"""
	Incremental decoder that buffers input until the encoding of the XML
	document can be detected and then hands everything over to the
	incremental decoder of the detected encoding.
	"""

	def __init__(self, errors='strict'):
		# self.decoder must exist before the base class constructor runs: it
		# assigns self.errors, which goes through the property setter below,
		# and that setter looks at self.decoder (and stores self._errors)
		self.decoder = None
		codecs.IncrementalDecoder.__init__(self, errors)
		self.buffer = "" # input seen so far while the encoding is still unknown

	def decode(self, input, final=False):
		"""
		Decode ``input``. Returns an empty string as long as the encoding is
		still undetermined. Raises ``UnicodeError`` if ``final`` is true and
		no encoding could be detected, or if the document names this codec
		as its own encoding.
		"""
		if self.decoder is None:
			input = self.buffer + input
			encoding = _detectencoding(input, final)
			if encoding is None:
				if final: # This is the last call, and we haven't determined an encoding yet => complain
					raise UnicodeError("can't detect encoding")
				self.buffer = input # retry the complete input on the next call
				return u"" # no encoding determined yet, so no output
			if codecs.lookup(encoding).name == "xml":
				# Delegating to ourselves would create decoder after decoder,
				# each one detecting "xml" again, until the stack overflows
				raise UnicodeError("XML declaration can't name the xml codec itself")
			self.decoder = codecs.getincrementaldecoder(encoding)(self.errors)
			self.buffer = "" # isn't needed any more, as the decoder might keep its own buffer
		return self.decoder.decode(input, final)

	def reset(self):
		"""Forget the detected encoding and all buffered input."""
		codecs.IncrementalDecoder.reset(self)
		self.decoder = None
		self.buffer = ""

	def _geterrors(self):
		return self._errors

	def _seterrors(self, errors):
		# Setting errors must be done on the real decoder too
		if self.decoder is not None:
			self.decoder.errors = errors
		self._errors = errors
	errors = property(_geterrors, _seterrors)

class StreamReader(codecs.StreamReader):
	"""
	Stream reader that decodes XML by feeding everything it reads through
	an IncrementalDecoder.
	"""

	def __init__(self, stream, errors="strict"):
		# self.decoder must exist before the base class constructor runs: it
		# assigns self.errors, which can go through the property setter below,
		# and that setter looks at self.decoder
		self.decoder = None
		codecs.StreamReader.__init__(self, stream, errors)
		self.decoder = IncrementalDecoder(errors)
		self._errors = errors

	def decode(self, input, errors='strict'):
		"""
		Decode ``input`` and return ``(output, len(input))``.

		The input is always reported as completely consumed, because the
		incremental decoder keeps whatever it can't decode yet. The ``errors``
		argument is ignored; the decoder's own error handling is used.
		"""
		return (self.decoder.decode(input, False), len(input))

	def reset(self):
		"""Reset the reader's buffers and the encoding detection state."""
		codecs.StreamReader.reset(self)
		# Without this the decoder would keep the old encoding and its
		# buffered input after the stream has been repositioned
		if self.decoder is not None:
			self.decoder.reset()

	def _geterrors(self):
		return self._errors

	def _seterrors(self, errors):
		# Setting errors must be done on the real decoder too
		if self.decoder is not None:
			self.decoder.errors = errors
		self._errors = errors
	errors = property(_geterrors, _seterrors)

def search_function(name):
	"""
	Codec search function for ``codecs.register``: return the codec
	description for the name "xml" and ``None`` for every other name.
	"""
	if name != "xml":
		return None
	# Decoding only: all encoder related entries stay empty
	return codecs.CodecInfo(
		name="xml",
		decode=decode,
		incrementaldecoder=IncrementalDecoder,
		streamreader=StreamReader,
		encode=None,
		incrementalencoder=None,
		streamwriter=None,
	)

# Importing this module registers the search function with the codecs
# module, so that lookups of the name "xml" are answered by it
codecs.register(search_function)
