import codecs, encodings
# Maps the first four bytes of an XML document to the encoding family they
# imply.  ``None`` in a key marks a position that is ignored: those entries
# are only consulted after an exact four-byte match has failed, using just
# the first two bytes of the input.  Every value except "EBCDIC" is a codec
# name accepted by ``codecs.lookup``.
autodetect_dict = {
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",  # UCS-4 byte-order mark
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",  # UCS-4 byte-order mark
    (0xFE, 0xFF, None, None): "utf_16_be",  # UTF-16 byte-order mark
    (0xFF, 0xFE, None, None): "utf_16_le",  # UTF-16 byte-order mark
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",  # "<?" in 16-bit units, no BOM
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",  # "<?" in 16-bit units, no BOM
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",      # "<?xm" in an ASCII superset
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",     # "<?xm" in EBCDIC; no codec
}


def _declared_encoding(text):
    """Return the ``encoding`` value of a leading XML declaration, or None.

    ``text`` is the already decoded start of a document.  None is returned
    when the text does not start with ``<?xml``, when the declaration has no
    ``encoding`` pseudo-attribute, or when its value is empty or unterminated.
    """
    # Endian-specific UTF-16/32 codecs leave the byte-order mark in the
    # decoded text, which would otherwise hide the declaration.
    if text.startswith(u"\ufeff"):
        text = text[1:]
    if not text.startswith(u"<?xml"):
        return None

    # Only look inside the declaration itself so that an "encoding" attribute
    # on a later element is never mistaken for the document encoding.  When
    # the terminator is missing (truncated input) fall back to the first line.
    end = text.find(u"?>")
    declaration = text[:end] if end != -1 else text.split(u"\n")[0]

    encoding_pos = declaration.find(u"encoding")
    if encoding_pos == -1:
        return None

    # The value may use either quote style; the opening quote is whichever
    # one appears first after the attribute name.
    quote_positions = [
        pos
        for pos in (
            declaration.find(u'"', encoding_pos),
            declaration.find(u"'", encoding_pos),
        )
        if pos != -1
    ]
    if not quote_positions:
        return None
    quote_pos = min(quote_positions)
    quote_char = declaration[quote_pos]

    value, closing, _ = declaration[quote_pos + 1:].partition(quote_char)
    # ``closing`` is empty when the value was never terminated.
    return value if closing and value else None


def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of the XML document held in ``buffer``.

    The leading bytes are matched against ``autodetect_dict`` to pick an
    encoding family.  If a family is recognised, the start of the document is
    decoded with it and the ``encoding`` value of the XML declaration, when
    present, is returned instead.

    Args:
        buffer: The raw bytes of the document (a byte string).  Four or more
            bytes are needed for a full signature match; shorter input is
            matched on its first two bytes, if it has them.

    Returns:
        The encoding name as a string: the declared encoding if one could be
        read, otherwise the family implied by the leading bytes, otherwise
        the default "utf_8".  The returned name is not guaranteed to have an
        installed codec (for example "EBCDIC", or any declared name).
    """
    encoding = "utf_8"  # returned whenever nothing better can be determined

    # bytearray yields integers for byte strings on both Python 2 and 3.
    head = tuple(bytearray(buffer[0:4]))

    enc_info = autodetect_dict.get(head) if len(head) == 4 else None
    if not enc_info and len(head) >= 2:
        # Retry with the trailing two bytes treated as "don't care".
        enc_info = autodetect_dict.get(head[:2] + (None, None))
    if not enc_info:
        return encoding

    encoding = enc_info
    try:
        decode = codecs.lookup(encoding).decode
    except LookupError:
        # Recognised family without an installed codec (e.g. "EBCDIC"):
        # the declaration cannot be read, so report the family itself.
        return encoding

    # Decode leniently: the guessed family only has to be good enough to read
    # the declaration, and the body may legitimately contain bytes that are
    # invalid in it (e.g. a Latin-1 document sniffed as UTF-8).
    decoded, _ = decode(buffer, "replace")
    return _declared_encoding(decoded) or encoding