encoding.h


    1 /*
    2  * encoding.h : interface for the encoding conversion functions needed for
    3  *              XML
    4  *
    5  * Related specs: 
    6  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
    7  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
    8  * [ISO-8859-1]   ISO Latin-1 characters codes.
    9  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
    10  *                Worldwide Character Encoding -- Version 1.0", Addison-
    11  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
    12  *                described in Unicode Technical Report #4.
    13  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
    14  *                Information Interchange, ANSI X3.4-1986.
    15  *
    16  * See Copyright for the status of this software.
    17  *
    18  * daniel@veillard.com
    19  */
    20 
    21 #ifndef __XML_CHAR_ENCODING_H__
    22 #define __XML_CHAR_ENCODING_H__
    23 
    24 #include 
    25 
    26 #ifdef LIBXML_ICONV_ENABLED
    27 #include 
    28 #endif
    29 #ifdef __cplusplus
    30 extern "C" {
    31 #endif
    32 
    /**
     * xmlCharEncoding:
     *
     * Predefined values for some standard encodings.
     * Libxml does not do beforehand translation on UTF8, ISOLatinX.
     * It also supports UTF16 (LE and BE) by default.
     *
     * Anything else would have to be translated to UTF8 before being
     * given to the parser itself. The BOM for UTF16 and the encoding
     * declaration are looked at and a converter is looked for at that
     * point. If not found the parser stops here as asked by the XML REC.
     * A converter can be registered by the user using
     * xmlRegisterCharEncodingHandler, but the current form doesn't allow
     * stateful transcoding (a serious problem, agreed!). If iconv has been
     * found it will be used automatically and allows stateful transcoding;
     * the simplest is then to be sure to enable iconv and to provide iconv
     * libs for the encoding support needed.
     *
     * The numeric values are spelled out explicitly and must not be
     * renumbered.
     */
    typedef enum {
        XML_CHAR_ENCODING_ERROR     = -1, /* Error marker; no char encoding detected */
        XML_CHAR_ENCODING_NONE      = 0,  /* No char encoding detected */
        XML_CHAR_ENCODING_UTF8      = 1,  /* UTF-8 */
        XML_CHAR_ENCODING_UTF16LE   = 2,  /* UTF-16 little endian */
        XML_CHAR_ENCODING_UTF16BE   = 3,  /* UTF-16 big endian */
        XML_CHAR_ENCODING_UCS4LE    = 4,  /* UCS-4 little endian */
        XML_CHAR_ENCODING_UCS4BE    = 5,  /* UCS-4 big endian */
        XML_CHAR_ENCODING_EBCDIC    = 6,  /* EBCDIC uh! */
        XML_CHAR_ENCODING_UCS4_2143 = 7,  /* UCS-4 unusual ordering */
        XML_CHAR_ENCODING_UCS4_3412 = 8,  /* UCS-4 unusual ordering */
        XML_CHAR_ENCODING_UCS2      = 9,  /* UCS-2 */
        XML_CHAR_ENCODING_8859_1    = 10, /* ISO-8859-1 ISO Latin 1 */
        XML_CHAR_ENCODING_8859_2    = 11, /* ISO-8859-2 ISO Latin 2 */
        XML_CHAR_ENCODING_8859_3    = 12, /* ISO-8859-3 */
        XML_CHAR_ENCODING_8859_4    = 13, /* ISO-8859-4 */
        XML_CHAR_ENCODING_8859_5    = 14, /* ISO-8859-5 */
        XML_CHAR_ENCODING_8859_6    = 15, /* ISO-8859-6 */
        XML_CHAR_ENCODING_8859_7    = 16, /* ISO-8859-7 */
        XML_CHAR_ENCODING_8859_8    = 17, /* ISO-8859-8 */
        XML_CHAR_ENCODING_8859_9    = 18, /* ISO-8859-9 */
        XML_CHAR_ENCODING_2022_JP   = 19, /* ISO-2022-JP */
        XML_CHAR_ENCODING_SHIFT_JIS = 20, /* Shift_JIS */
        XML_CHAR_ENCODING_EUC_JP    = 21, /* EUC-JP */
        XML_CHAR_ENCODING_ASCII     = 22  /* pure ASCII */
    } xmlCharEncoding;
    77 
    /**
     * xmlCharEncodingInputFunc:
     * @out:  a pointer to an array of bytes to store the UTF-8 result
     * @outlen:  the length of @out
     * @in:  a pointer to an array of chars in the original encoding
     * @inlen:  the length of @in
     *
     * Take a block of chars in the original encoding and try to convert
     * it to a UTF-8 block of chars out.
     *
     * Returns the number of bytes written, or -1 by lack of space, or -2
     *     if the transcoding failed.
     * The value of @inlen after return is the number of octets consumed
     *     if the return value is positive, else unpredictable.
     * The value of @outlen after return is the number of octets consumed
     *     in @out.
     */
    typedef int (*xmlCharEncodingInputFunc)(unsigned char *out,
                                            int *outlen,
                                            const unsigned char *in,
                                            int *inlen);
    96 
    97 
    /**
     * xmlCharEncodingOutputFunc:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
     * @in:  a pointer to an array of UTF-8 chars
     * @inlen:  the length of @in
     *
     * Take a block of UTF-8 chars in and try to convert it to another
     * encoding.
     * Note: a first call designed to produce heading info is called with
     * in = NULL. If stateful this should also initialize the encoder state.
     *
     * Returns the number of bytes written, or -1 by lack of space, or -2
     *     if the transcoding failed.
     * The value of @inlen after return is the number of octets consumed
     *     if the return value is positive, else unpredictable.
     * The value of @outlen after return is the number of octets consumed
     *     in @out.
     */
    typedef int (*xmlCharEncodingOutputFunc)(unsigned char *out,
                                             int *outlen,
                                             const unsigned char *in,
                                             int *inlen);
    118 
    119 
    120 /*
    121  * Block defining the handlers for non UTF-8 encodings.
    122  * If iconv is supported, there is two extra fields.
    123  */
    124 
    125 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;  <_xmlCharEncodingHandler>
    126 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;  <typedef:xmlCharEncodingHandlerPtr>
    127 struct _xmlCharEncodingHandler {
    128     char                       *name;
    129     xmlCharEncodingInputFunc   input;
    130     xmlCharEncodingOutputFunc  output;
    131 #ifdef LIBXML_ICONV_ENABLED
    132     iconv_t                    iconv_in;
    133     iconv_t                    iconv_out;
    134 #endif /* LIBXML_ICONV_ENABLED */
    135 };
    136 
    137 #ifdef __cplusplus
    138 }
    139 #endif
    140 #include 
    141 #ifdef __cplusplus
    142 extern "C" {
    143 #endif
    144 
    145 /*
    146  * Interfaces for encoding handlers.
    147  */
    148 void	xmlInitCharEncodingHandlers	(void);
    149 void	xmlCleanupCharEncodingHandlers	(void);
    150 void	xmlRegisterCharEncodingHandler	(xmlCharEncodingHandlerPtr handler);
    151 xmlCharEncodingHandlerPtr
    152 	xmlGetCharEncodingHandler	(xmlCharEncoding enc);
    153 xmlCharEncodingHandlerPtr
    154 	xmlFindCharEncodingHandler	(const char *name);
    155 xmlCharEncodingHandlerPtr
    156 	xmlNewCharEncodingHandler	(const char *name, 
    157                           		 xmlCharEncodingInputFunc input,
    158                           		 xmlCharEncodingOutputFunc output);
    159 
    160 /*
    161  * Interfaces for encoding names and aliases.
    162  */
    163 int	xmlAddEncodingAlias		(const char *name,
    164 					 const char *alias);
    165 int	xmlDelEncodingAlias		(const char *alias);
    166 const char *
    167 	xmlGetEncodingAlias		(const char *alias);
    168 void	xmlCleanupEncodingAliases	(void);
    169 xmlCharEncoding
    170 	xmlParseCharEncoding		(const char *name);
    171 const char *
    172 	xmlGetCharEncodingName		(xmlCharEncoding enc);
    173 
    174 /*
    175  * Interfaces directly used by the parsers.
    176  */
    177 xmlCharEncoding
    178 	xmlDetectCharEncoding		(const unsigned char *in,
    179 					 int len);
    180 
    181 int	xmlCharEncOutFunc		(xmlCharEncodingHandler *handler,
    182 					 xmlBufferPtr out,
    183 					 xmlBufferPtr in);
    184 
    185 int	xmlCharEncInFunc		(xmlCharEncodingHandler *handler,
    186 					 xmlBufferPtr out,
    187 					 xmlBufferPtr in);
    188 int	xmlCharEncFirstLine		(xmlCharEncodingHandler *handler,
    189 					 xmlBufferPtr out,
    190 					 xmlBufferPtr in);
    191 int	xmlCharEncCloseFunc		(xmlCharEncodingHandler *handler);
    192 
    /*
     * Export a few useful functions.
     * UTF8Toisolat1 and isolat1ToUTF8 take the same parameter list
     * (out, outlen, in, inlen) and return type as the
     * xmlCharEncodingInputFunc / xmlCharEncodingOutputFunc callback types.
     */
    int UTF8Toisolat1(unsigned char *out,
                      int *outlen,
                      const unsigned char *in,
                      int *inlen);
    int isolat1ToUTF8(unsigned char *out,
                      int *outlen,
                      const unsigned char *in,
                      int *inlen);
    int xmlGetUTF8Char(const unsigned char *utf,
                       int *len);
    206 /*
    207  * exports additional "UTF-8 aware" string routines which are.
    208  */
    209 
    210 int	xmlCheckUTF8			(const unsigned char *utf);
    211 
    212 int	xmlUTF8Strsize			(const xmlChar *utf,
    213 					 int len);
    214 xmlChar * xmlUTF8Strndup		(const xmlChar *utf,
    215 					 int len);
    216 xmlChar * xmlUTF8Strpos			(const xmlChar *utf,
    217 					 int pos);
    218 int	xmlUTF8Strloc			(const xmlChar *utf,
    219 					 const xmlChar *utfchar);
    220 xmlChar * xmlUTF8Strsub			(const xmlChar *utf,
    221 					 int start,
    222 					 int len);
    223 
    224 int	xmlUTF8Strlen			(const xmlChar *utf);
    225 
    226 #ifdef __cplusplus
    227 }
    228 #endif
    229 
    230 #endif /* __XML_CHAR_ENCODING_H__ */