import
codecs, encodings
# Lookup table: first four bytes of an XML document -> encoding label.
#
# A ``None`` in a key is a placeholder for a byte that is not compared;
# autoDetectXMLEncoding retries its lookup with the third and fourth bytes
# replaced by ``None`` when the exact four-byte lookup finds nothing.
#
# The values are labels, not guaranteed Python codec names: "ucs4_be",
# "ucs4_le" and "EBCDIC" are not registered in the standard codec registry.
autodetect_dict = {
    # Four-byte byte-order marks.
    (0x00, 0x00, 0xFE, 0xFF): "ucs4_be",
    (0xFF, 0xFE, 0x00, 0x00): "ucs4_le",
    # Two-byte byte-order marks; the following two bytes are not compared.
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",
    # No byte-order mark: "<?" stored as two 16-bit code units.
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",
    # No byte-order mark: "<?xm" in an ASCII-compatible encoding.
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",
    # No byte-order mark: "<?xm" in EBCDIC.
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",
}
# Labels from autodetect_dict that are not Python codec names, mapped to a
# codec that can still decode the characters used in an XML declaration.
# EBCDIC code pages agree on letters, digits, quotes, "<", "?", "=" and ">",
# so one representative code page is enough to read the declared name.
_DECLARATION_CODECS = {
    "EBCDIC": "cp037",
}


def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of an XML document from its first bytes.

    The first four bytes are looked up in ``autodetect_dict``; if that fails,
    the lookup is retried with only the first two bytes significant.  When a
    family is recognised, the buffer is decoded with it and the ``encoding``
    pseudo-attribute of a leading ``<?xml ...?>`` declaration, if present on
    the first line, replaces the guess.

    Args:
        buffer: The leading bytes of the document (a byte string).  It may be
            shorter than four bytes, or empty.

    Returns:
        The encoding name.  This is ``"utf_8"`` when nothing is recognised,
        the label from ``autodetect_dict`` when the declaration does not name
        an encoding, and otherwise the name exactly as written in the
        declaration (it is not normalised or validated, so it may not be an
        installed Python codec).

    Notes:
        * A document that starts with a byte-order mark decodes to text that
          begins with U+FEFF rather than ``<?xml``, so its declaration is not
          consulted and the label derived from the byte-order mark is
          returned.
        * Only the first line of the decoded text is examined; a declaration
          that puts ``encoding`` on a later line is not refined.
    """
    encoding = "utf_8"

    # bytearray() yields integers for Python 3 ``bytes`` and Python 2 ``str``
    # alike.  Pad with None so short buffers still form a four-item key.
    head = tuple(bytearray(buffer[:4]))
    head += (None,) * (4 - len(head))

    guess = autodetect_dict.get(head)
    if not guess:
        # Retry with the third and fourth bytes treated as "don't care".
        guess = autodetect_dict.get(head[:2] + (None, None))
    if not guess:
        return encoding
    encoding = guess

    try:
        decode = codecs.lookup(_DECLARATION_CODECS.get(guess, guess)).decode
    except LookupError:
        # No decoder for this label: the label itself is the best answer.
        return encoding

    # "replace" keeps bytes that are invalid for the guessed family (a
    # Latin-1 body under the utf_8 guess, or a buffer cut mid-character)
    # from aborting detection before the declaration has been read.
    decoded = decode(buffer, "replace")[0]
    first_line = decoded.split("\n", 1)[0]
    if not first_line.startswith("<?xml"):
        return encoding

    # Stop at the end of the declaration so an "encoding" attribute on an
    # element that follows it on the same line is not mistaken for it.
    declaration = first_line.split("?>", 1)[0]
    encoding_pos = declaration.find("encoding")
    if encoding_pos == -1:
        return encoding

    # Use whichever quote character appears first after "encoding"; a later
    # pseudo-attribute may use the other kind of quote.
    quote_positions = [
        pos
        for pos in (
            declaration.find('"', encoding_pos),
            declaration.find("'", encoding_pos),
        )
        if pos != -1
    ]
    if not quote_positions:
        return encoding

    open_pos = min(quote_positions)
    close_pos = declaration.find(declaration[open_pos], open_pos + 1)
    # Require a closing quote and a non-empty value; otherwise keep the guess.
    if close_pos > open_pos + 1:
        encoding = declaration[open_pos + 1:close_pos]

    return encoding