/* charset.h */
/*
* Copyright (C) 2001 Edmund Grimley Evans <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdlib.h>
/*
 * These functions are like the C library's mbtowc() and wctomb(),
 * but instead of depending on the locale they always work in UTF-8,
 * and they use int instead of wchar_t.
 *
 * utf8_mbtowc() examines at most N bytes starting at S. It returns
 * the number of bytes that make up the first character and stores
 * the character in *PWC (if PWC is not null). It returns 0 if N is
 * zero, S is null or the first byte is a null byte, and -1 if the
 * bytes do not form a complete, shortest-form sequence.
 *
 * utf8_wctomb() writes the encoding of WC to S (1 to 6 bytes, with no
 * terminator) and returns the number of bytes written. It returns 0
 * if S is null and -1 if WC is negative.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n);
int utf8_wctomb(char *s, int wc);
/*
 * This is an object-oriented version of mbtowc() and wctomb().
 * The caller first uses charset_find() to get a pointer to struct
 * charset, then uses the mbtowc() and wctomb() methods on it.
 * The function charset_max() gives the maximum length of a
 * multibyte character in that encoding.
 * This API is only appropriate for stateless encodings like UTF-8
 * or ISO-8859-3, but I have no intention of implementing anything
 * other than UTF-8 and 8-bit encodings.
 *
 * charset_find() looks the name up without regard to ASCII case and
 * returns 0 if it is not recognised. The objects it returns are
 * shared (either static or cached by the implementation), so the
 * caller must not free them.
 *
 * charset_mbtowc() and charset_wctomb() return the number of bytes
 * consumed or produced, 0 for a null byte or a null S, and -1 if
 * the conversion is not possible.
 *
 * MINOR BUG: If there is no memory charset_find() may return 0 and
 * there is no way to distinguish this case from an unknown encoding.
 */
struct charset;
struct charset *charset_find(const char *code);
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n);
int charset_wctomb(struct charset *charset, char *s, int wc);
int charset_max(struct charset *charset);
/*
 * Function to convert a buffer from one encoding to another.
 * Invalid bytes are replaced by '#', and characters that are
 * not available in the target encoding are replaced by '?'.
 * Each of TO and TOLEN may be zero if the result is not wanted.
 * The input or output may contain null bytes, but the output
 * buffer is also null-terminated, so it is all right to
 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 *
 * When TO is not zero, *TO receives a buffer obtained from malloc()
 * which the caller is responsible for freeing. When TOLEN is not
 * zero, *TOLEN receives the length of the converted data, not
 * counting the terminating null byte. Neither is written when the
 * return value is negative.
 *
 * Return value:
 *
 * -2 : memory allocation failed
 * -1 : unknown encoding
 * 0 : data was converted exactly
 * 1 : valid data was converted approximately (using '?')
 * 2 : input was invalid (but still converted, using '#')
 */
int charset_convert(const char *fromcode, const char *tocode,
const char *from, size_t fromlen,
char **to, size_t *tolen);
/*
* Copyright (C) 2001 Edmund Grimley Evans <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* See the corresponding header file for a description of the functions
* that this file provides.
*
* This was first written for Ogg Vorbis but could be of general use.
*
* The only deliberate assumption about data sizes is that a short has
* at least 16 bits, but this code has only been tested on systems with
* 8-bit char, 16-bit short and 32-bit int.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
#include <stdlib.h>
#include "charset.h"
#include "charmaps.h"
/*
 * This is like the standard strcasecmp, but it does not depend
 * on the locale. Locale-dependent functions can be dangerous:
 * we once had a bug involving strcasecmp("iso", "ISO") in a
 * Turkish locale!
 *
 * Only the ASCII letters 'A'..'Z' are folded, and they are folded
 * to lower case, which is what POSIX specifies for strcasecmp() in
 * the POSIX locale. The result is the difference between the first
 * pair of folded bytes that differ (compared as unsigned char), so
 * it is negative, zero or positive in a case-insensitive order;
 * for example "a" sorts before "B".
 *
 * Both arguments must be non-null, null-terminated strings.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
    unsigned char c1, c2;

    for (;; s1++, s2++) {
        c1 = (unsigned char)*s1;
        c2 = (unsigned char)*s2;
        if ('A' <= c1 && c1 <= 'Z')
            c1 += 'a' - 'A';
        if ('A' <= c2 && c2 <= 'Z')
            c2 += 'a' - 'A';
        /*
         * Stop at the first difference. If the bytes are equal and
         * zero, both strings ended together, so stop as well.
         */
        if (c1 != c2 || !c1)
            break;
    }
    return c1 - c2;
}
/*
* UTF-8 equivalents of the C library's wctomb() and mbtowc().
*/
/*
 * Decode the first UTF-8 sequence in the N bytes at S.
 *
 * Returns the length of the sequence and stores the decoded value
 * in *PWC when PWC is not null. Returns 0 when N is zero, S is null
 * or the first byte is a null byte (in which case 0 is stored).
 * Returns -1 for a stray continuation byte, an invalid lead byte, a
 * sequence that is cut short by N, or a sequence that is longer than
 * necessary for its value.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
    const unsigned char *bytes = (const unsigned char *)s;
    unsigned char lead;
    int len, value, pos;

    if (!s || !n)
        return 0;

    lead = bytes[0];
    if (lead < 0x80) {
        /* Plain ASCII, including the null byte. */
        if (pwc)
            *pwc = lead;
        return lead != 0;
    }

    /*
     * 0x80..0xbf are continuation bytes, 0xc0 and 0xc1 could only
     * start an over-long two-byte form, 0xfe and 0xff are never used.
     */
    if (lead < 0xc2 || lead >= 0xfe)
        return -1;

    if (lead < 0xe0)
        len = 2;
    else if (lead < 0xf0)
        len = 3;
    else if (lead < 0xf8)
        len = 4;
    else if (lead < 0xfc)
        len = 5;
    else
        len = 6;

    if (n < (size_t)len)
        return -1;

    /* The lead byte contributes its low (7 - len) bits. */
    value = lead & ((1 << (7 - len)) - 1);
    for (pos = 1; pos < len; pos++) {
        if ((bytes[pos] & 0xc0) != 0x80)
            return -1;
        value = (value << 6) | (bytes[pos] & 0x3f);
    }

    /*
     * Reject values that would have fitted in a shorter sequence.
     * Two-byte sequences need no check: lead bytes below 0xc2 were
     * already refused above.
     */
    if (len > 2 && value < (1 << (5 * len - 4)))
        return -1;

    if (pwc)
        *pwc = value;
    return len;
}
/*
 * Encode WC1 as UTF-8 into the buffer at S, which must have room for
 * up to six bytes. No terminating null byte is written.
 *
 * Returns the number of bytes stored (1 to 6), 0 when S is null, or
 * -1 when WC1 is negative and so has no encoding.
 */
int utf8_wctomb(char *s, int wc1)
{
    /* Marker bits of the lead byte, indexed by sequence length. */
    static const unsigned char lead_mark[] = {
        0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
    };
    unsigned int code = wc1;
    int len, pos;

    if (!s)
        return 0;

    if (code < (1u << 7))
        len = 1;
    else if (code < (1u << 11))
        len = 2;
    else if (code < (1u << 16))
        len = 3;
    else if (code < (1u << 21))
        len = 4;
    else if (code < (1u << 26))
        len = 5;
    else if (code < (1u << 31))
        len = 6;
    else
        return -1;

    /* Fill in the continuation bytes from the last one backwards. */
    for (pos = len - 1; pos > 0; pos--) {
        s[pos] = 0x80 | (code & 0x3f);
        code >>= 6;
    }
    /* Whatever is left of the value goes into the lead byte. */
    s[0] = lead_mark[len] | code;
    return len;
}
/*
* The charset "object" and methods.
*/
/*
 * Description of one encoding: its limits, the two conversion
 * routines that implement it, and the private data they work on.
 */
struct charset {
    /* Value reported by charset_max(). */
    int max;
    /* Decoder called by charset_mbtowc(); TABLE is the MAP member. */
    int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
    /* Encoder called by charset_wctomb(); TABLE is the MAP member. */
    int (*wctomb)(void *table, char *s, int wc);
    /* Opaque data handed to both routines as their first argument. */
    void *map;
};

/* Decode one character from the N bytes at S using CHARSET's decoder. */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
{
    return charset->mbtowc(charset->map, pwc, s, n);
}

/* Encode WC into S using CHARSET's encoder. */
int charset_wctomb(struct charset *charset, char *s, int wc)
{
    return charset->wctomb(charset->map, s, wc);
}

/* Return the MAX member of CHARSET. */
int charset_max(struct charset *charset)
{
    return charset->max;
}
/*
* Implementation of UTF-8.
*/
/* Decoder for the UTF-8 charset object; it needs no table. */
static int mbtowc_utf8(void *unused_map, int *pwc, const char *s, size_t n)
{
    int consumed = utf8_mbtowc(pwc, s, n);

    (void)unused_map;
    return consumed;
}
/* Encoder for the UTF-8 charset object; it needs no table. */
static int wctomb_utf8(void *unused_map, char *s, int wc)
{
    int produced = utf8_wctomb(s, wc);

    (void)unused_map;
    return produced;
}
/*
* Implementation of US-ASCII.
* Probably on most architectures this compiles to less than 256 bytes
* of code, so we can save space by not having a table for this one.
*/
/*
 * Decode one US-ASCII byte from S.
 *
 * Returns 1 and stores the byte in *PWC (when PWC is not null),
 * 0 for a null byte, a null S or N equal to zero, and -1 for any
 * byte above 0x7f. MAP is not used.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
    unsigned char byte;

    (void)map;
    if (s == 0 || n == 0)
        return 0;

    byte = (unsigned char)*s;
    if (byte > 0x7f)
        return -1;

    if (pwc != 0)
        *pwc = byte;
    return byte != 0;
}
/*
 * Encode WC as one US-ASCII byte at S.
 *
 * Returns 1 on success, 0 when S is null, and -1 when WC lies
 * outside 0..0x7f. MAP is not used.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
    (void)map;
    if (s == 0)
        return 0;
    if (wc < 0 || wc > 0x7f)
        return -1;
    s[0] = (char)wc;
    return 1;
}
/*
* Implementation of ISO-8859-1.
* Probably on most architectures this compiles to less than 256 bytes
* of code, so we can save space by not having a table for this one.
*/
/*
 * Decode one ISO-8859-1 byte from S; the byte value is the
 * character value.
 *
 * Returns 1 and stores the value in *PWC (when PWC is not null),
 * 0 for a null byte, a null S or N equal to zero, and -1 if the
 * value does not fit in eight bits. MAP is not used.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
    int value;

    (void)map;
    if (s == 0 || n == 0)
        return 0;

    value = (unsigned char)*s;
    /* Only reachable where unsigned char is wider than eight bits. */
    if ((value >> 8) != 0)
        return -1;

    if (pwc != 0)
        *pwc = value;
    return value != 0;
}
/*
 * Encode WC as one ISO-8859-1 byte at S.
 *
 * Returns 1 on success, 0 when S is null, and -1 when WC lies
 * outside 0..0xff. MAP is not used.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
    (void)map;
    if (s == 0)
        return 0;
    if (wc < 0 || wc > 0xff)
        return -1;
    s[0] = (char)wc;
    return 1;
}
/*
* Implementation of any 8-bit charset.
*/
/* Private data of a table-driven 8-bit charset. */
struct map {
    /* Character for each byte value; 0xffff marks an unassigned byte. */
    const unsigned short *from;
    /* Reverse lookup table, or null while it has not been built. */
    struct inverse_map *to;
};

/*
 * Decode one byte from S by looking it up in the FROM table of the
 * struct map that MAP1 points to.
 *
 * Returns 1 and stores the character in *PWC (when PWC is not null),
 * 0 when the byte maps to character 0, S is null or N is zero, and
 * -1 when the byte is unassigned in this charset.
 */
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
{
    const struct map *table = map1;
    unsigned short value;

    if (s == 0 || n == 0)
        return 0;

    value = table->from[(unsigned char)*s];
    if (value == 0xffff)
        return -1;

    if (pwc != 0)
        *pwc = (int)value;
    return value != 0;
}
/*
* For the inverse map we use a hash table, which has the advantages
* of small constant memory requirement and simple memory allocation,
* but the disadvantage of slow conversion in the worst case.
* If you need real-time performance while letting a potentially
* malicious user define their own map, then the method used in
* linux/drivers/char/consolemap.c would be more appropriate.
*/
/*
 * Chained hash table from character to byte value. FIRST holds the
 * head of the chain for each hash bucket and NEXT links the byte
 * values within a chain; a NEXT of zero ends the chain.
 */
struct inverse_map {
    unsigned char first[256];
    unsigned char next[256];
};

/*
 * Bucket number for character I. Taking the low eight bits is good
 * enough for this application; defining HASH(i) as a constant such
 * as 0 or 99 puts everything in one chain, which is useful for
 * testing.
 */
#define HASH(i) ((i) & 0xff)

/*
 * Build the reverse lookup table for the 256-entry table FROM, in
 * which 0xffff marks an unassigned byte.
 *
 * Returns a table obtained from the allocator, or 0 if no memory
 * is available.
 */
static struct inverse_map *make_inverse_map(const unsigned short *from)
{
    struct inverse_map *inv;
    char bucket_used[256] = { 0 };
    int code, bucket, tail;

    inv = calloc(1, sizeof *inv);
    if (!inv)
        return 0;

    /*
     * Push every assigned byte onto the front of its chain, highest
     * byte first. Byte 0 is therefore pushed last and can only ever
     * be the head of a chain, never a link inside one, which is what
     * allows zero to serve as the end-of-chain marker.
     */
    code = 256;
    while (code-- > 0) {
        if (from[code] == 0xffff)
            continue;
        bucket = HASH(from[code]);
        inv->next[code] = inv->first[bucket];
        inv->first[bucket] = code;
        bucket_used[bucket] = 1;
    }

    /*
     * Point the empty buckets at an empty list: pick the first byte
     * with nothing chained after it, so that a lookup in an empty
     * bucket compares against one entry and then stops.
     */
    tail = 0;
    while (tail < 256 && inv->next[tail])
        tail++;
    if (tail < 256) {
        for (bucket = 0; bucket < 256; bucket++) {
            if (!bucket_used[bucket])
                inv->first[bucket] = tail;
        }
    }
    return inv;
}
/*
 * Encode WC1 as one byte at S, using the struct map that MAP1
 * points to.
 *
 * The reverse lookup table is built the first time it is needed and
 * kept in the map. If it cannot be allocated, the forward table is
 * searched linearly instead, so the result is the same either way.
 *
 * Returns 1 on success, 0 when S is null, and -1 when WC1 has no
 * byte in this charset. The value 0xffff is always refused: the
 * forward table uses it to mark unassigned bytes, so comparing it
 * against the table would wrongly "find" the first unassigned byte.
 */
int wctomb_8bit(void *map1, char *s, int wc1)
{
    struct map *map = map1;
    unsigned short wc;
    int i;

    if (!s)
        return 0;
    if ((wc1 & ~0xffff) || wc1 == 0xffff)
        return -1;
    wc = (unsigned short)wc1;

    if (!map->to)
        map->to = make_inverse_map(map->from);

    if (map->to) {
        /* Use the inverse map: walk the chain for this hash bucket. */
        i = map->to->first[HASH(wc)];
        do {
            if (map->from[i] == wc) {
                *s = i;
                return 1;
            }
            i = map->to->next[i];
        } while (i);
    }
    else {
        /* We don't have an inverse map, so do a linear search. */
        for (i = 0; i < 256; i++) {
            if (map->from[i] == wc) {
                *s = i;
                return 1;
            }
        }
    }
    return -1;
}
/*
* The "constructor" charset_find().
*/
/* The built-in UTF-8 charset object. */
struct charset charset_utf8 = {
    6,              /* max: longest sequence utf8_wctomb() produces */
    mbtowc_utf8,    /* mbtowc */
    wctomb_utf8,    /* wctomb */
    0               /* map: not used */
};
/* The built-in ISO-8859-1 charset object. */
struct charset charset_iso1 = {
    1,              /* max: always one byte per character */
    mbtowc_iso1,    /* mbtowc */
    wctomb_iso1,    /* wctomb */
    0               /* map: not used */
};
/* The built-in US-ASCII charset object. */
struct charset charset_ascii = {
    1,              /* max: always one byte per character */
    mbtowc_ascii,   /* mbtowc */
    wctomb_ascii,   /* wctomb */
    0               /* map: not used */
};
struct charset *charset_find(const char *code)
{
int i;
/* Find good (MIME) name. */
for (i = 0; names[i].bad; i++)
if (!ascii_strcasecmp(code, names[i].bad)) {
code = names[i].good;
break;
}
/* Recognise some charsets for which we avoid using a table. */
if (!ascii_strcasecmp(code, "UTF-8"))
return &charset_utf8;
if (!ascii_strcasecmp(code, "US-ASCII"))
return &charset_ascii;
if (!ascii_strcasecmp(code, "ISO-8859-1"))
return &charset_iso1;
/* Look for a mapping for a simple 8-bit encoding. */
for (i = 0; maps[i].name; i++)
if (!ascii_strcasecmp(code, maps[i].name)) {
if (!maps[i].charset) {
maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
if (maps[i].charset) {
struct map *map = (struct map *)malloc(sizeof(struct map));
if (!map) {
free(maps[i].charset);
maps[i].charset = 0;
}
else {
maps[i].charset->max = 1;
maps[i].charset->mbtowc = &mbtowc_8bit;
maps[i].charset->wctomb = &wctomb_8bit;
maps[i].charset->map = map;
map->from = maps[i].map;
map->to = 0; /* inverse mapping is created when required */
}
}
}
return maps[i].charset;
}
return 0;
}
/*
 * Function to convert a buffer from one encoding to another.
 * Invalid bytes are replaced by '#', and characters that are
 * not available in the target encoding are replaced by '?'.
 * Each of TO and TOLEN may be zero, if the result is not needed.
 * The output buffer is null-terminated, so it is all right to
 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 *
 * When TO is not zero, *TO receives a buffer obtained from malloc()
 * which the caller must free. When TOLEN is not zero, *TOLEN
 * receives the number of bytes produced, not counting the
 * terminator. Neither is written when the return value is negative.
 *
 * Return value:
 *
 * -2 : memory allocation failed, or the output buffer size would
 *      not fit in a size_t
 * -1 : unknown encoding
 *  0 : data was converted exactly
 *  1 : valid data was converted approximately (using '?')
 *  2 : input was invalid (but still converted, using '#')
 */
int charset_convert(const char *fromcode, const char *tocode,
                    const char *from, size_t fromlen,
                    char **to, size_t *tolen)
{
    int ret = 0;
    struct charset *charset1, *charset2;
    char *tobuf, *p, *newbuf;
    size_t maxbytes;
    int i, j, wc = 0;

    charset1 = charset_find(fromcode);
    charset2 = charset_find(tocode);
    if (!charset1 || !charset2)
        return -1;

    /*
     * Every input byte yields at most MAXBYTES output bytes, plus one
     * byte for the terminator. Refuse sizes that would wrap around,
     * since a wrapped size would give a buffer that is too small.
     */
    maxbytes = (size_t)charset2->max;
    if (maxbytes && fromlen > ((size_t)-1 - 1) / maxbytes)
        return -2;
    tobuf = (char *)malloc(fromlen * maxbytes + 1);
    if (!tobuf)
        return -2;

    for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
        i = charset_mbtowc(charset1, &wc, from, fromlen);
        if (!i) {
            /* A null character: it still occupies one input byte. */
            i = 1;
        }
        else if (i == -1) {
            /* Invalid input: skip one byte and substitute '#'. */
            i = 1;
            wc = '#';
            ret = 2;
        }
        j = charset_wctomb(charset2, p, wc);
        if (j == -1) {
            /* Not representable in the target: substitute '?'. */
            if (!ret)
                ret = 1;
            j = charset_wctomb(charset2, p, '?');
            if (j == -1)
                j = 0;
        }
    }

    if (tolen)
        *tolen = p - tobuf;
    *p++ = '\0';
    if (to) {
        /* Trim the buffer; if that fails the original is still good. */
        newbuf = realloc(tobuf, p - tobuf);
        *to = newbuf ? newbuf : tobuf;
    }
    else
        free(tobuf);
    return ret;
}
#endif /* !HAVE_ICONV */
/*
* Copyright (C) 2001 Edmund Grimley Evans <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifdef HAVE_ICONV
#include <assert.h>
#include <errno.h>
#include <iconv.h>
#include <stdlib.h>
#include <string.h>
/*
 * Report whether NAME is "UTF-8", ignoring ASCII case.
 *
 * strcasecmp() is not used because it is locale-dependent. The
 * comparison stops at the first byte that does not match, so it
 * never reads beyond the terminator of a shorter name.
 */
static int is_utf8_name(const char *name)
{
    return (name[0] == 'U' || name[0] == 'u') &&
           (name[1] == 'T' || name[1] == 't') &&
           (name[2] == 'F' || name[2] == 'f') &&
           name[3] == '-' &&
           name[4] == '8' &&
           name[5] == '\0';
}

/*
 * Convert data from one encoding to another. Return:
 *
 * -2 : memory allocation failed
 * -1 : unknown encoding
 * 0 : data was converted exactly
 * 1 : data was converted inexactly
 * 2 : data was invalid (but still converted)
 *
 * Each of TO and TOLEN may be zero if the result is not wanted.
 * When TO is not zero, *TO receives a null-terminated buffer
 * obtained from the allocator, which the caller must free. When
 * TOLEN is not zero, *TOLEN receives the length of the converted
 * data, not counting the terminator.
 *
 * Invalid input bytes are replaced by '#' and characters that the
 * target encoding cannot represent are replaced by '?'.
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 */
int iconvert(const char *fromcode, const char *tocode,
             const char *from, size_t fromlen,
             char **to, size_t *tolen)
{
    int ret = 0;
    iconv_t cd1, cd2;
    char *ib;
    char *ob;
    char *utfbuf = 0, *outbuf, *newbuf;
    size_t utflen, outlen, ibl, obl, k;
    char tbuf[2048];

    cd1 = iconv_open("UTF-8", fromcode);
    if (cd1 == (iconv_t)(-1))
        return -1;

    /* cd2 stays invalid when the target is UTF-8: no second step. */
    cd2 = (iconv_t)(-1);
    if (!is_utf8_name(tocode)) {
        char *tocode1;

        /*
         * Try using this non-standard feature of glibc and libiconv.
         * This is deliberately not a config option as people often
         * change their iconv library without rebuilding applications.
         */
        tocode1 = (char *)malloc(strlen(tocode) + 11);
        if (!tocode1)
            goto fail;
        strcpy(tocode1, tocode);
        strcat(tocode1, "//TRANSLIT");
        cd2 = iconv_open(tocode1, "UTF-8");
        free(tocode1);

        /*
         * Fall back to a plain conversion. The source must be UTF-8
         * here too, because cd2 is only ever fed the UTF-8 buffer.
         */
        if (cd2 == (iconv_t)(-1))
            cd2 = iconv_open(tocode, "UTF-8");
        if (cd2 == (iconv_t)(-1)) {
            iconv_close(cd1);
            return -1;
        }
    }

    /* Start with a minimal buffer; it is doubled on demand below. */
    utflen = 1; /*fromlen * 2 + 1; XXX */
    utfbuf = (char *)malloc(utflen);
    if (!utfbuf)
        goto fail;

    /* Convert to UTF-8 */
    ib = (char *)from;
    ibl = fromlen;
    ob = utfbuf;
    obl = utflen;
    for (;;) {
        k = iconv(cd1, &ib, &ibl, &ob, &obl);
        assert((!k && !ibl) ||
               (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
               (k == (size_t)(-1) &&
                (errno == EILSEQ || errno == EINVAL) && ibl));
        if (!ibl)
            break;
        if (obl < 6) {
            /* Enlarge the buffer */
            utflen *= 2;
            newbuf = (char *)realloc(utfbuf, utflen);
            if (!newbuf)
                goto fail;
            ob = (ob - utfbuf) + newbuf;
            obl = utflen - (ob - newbuf);
            utfbuf = newbuf;
        }
        else {
            /* Invalid input */
            ib++, ibl--;
            *ob++ = '#', obl--;
            ret = 2;
            /* Put the converter back into its initial state. */
            iconv(cd1, 0, 0, 0, 0);
        }
    }

    if (cd2 == (iconv_t)(-1)) {
        /* The target encoding was UTF-8 */
        if (tolen)
            *tolen = ob - utfbuf;
        if (!to) {
            free(utfbuf);
            iconv_close(cd1);
            return ret;
        }
        newbuf = (char *)realloc(utfbuf, (ob - utfbuf) + 1);
        if (!newbuf)
            goto fail;
        ob = (ob - utfbuf) + newbuf;
        *ob = '\0';
        *to = newbuf;
        iconv_close(cd1);
        return ret;
    }

    /*
     * Truncate the buffer to be tidy. Skip this for empty output:
     * realloc() with a size of zero may free the buffer and return
     * null, which would look like a failure and lead to the buffer
     * being freed a second time.
     */
    utflen = ob - utfbuf;
    if (utflen) {
        newbuf = (char *)realloc(utfbuf, utflen);
        if (!newbuf)
            goto fail;
        utfbuf = newbuf;
    }

    /* Convert from UTF-8 to discover how long the output is */
    outlen = 0;
    ib = utfbuf;
    ibl = utflen;
    while (ibl) {
        ob = tbuf;
        obl = sizeof(tbuf);
        k = iconv(cd2, &ib, &ibl, &ob, &obl);
        assert((k != (size_t)(-1) && !ibl) ||
               (k == (size_t)(-1) && errno == E2BIG && ibl) ||
               (k == (size_t)(-1) && errno == EILSEQ && ibl));
        if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
            /* Replace one character */
            char *tb = "?";
            size_t tbl = 1;

            outlen += ob - tbuf;
            ob = tbuf;
            obl = sizeof(tbuf);
            k = iconv(cd2, &tb, &tbl, &ob, &obl);
            assert((!k && !tbl) ||
                   (k == (size_t)(-1) && errno == EILSEQ && tbl));
            /*
             * Step over the lead byte and then over its continuation
             * bytes only (10xxxxxx), so that the next character is
             * not swallowed along with this one.
             */
            for (++ib, --ibl; ibl && (*ib & 0xc0) == 0x80; ib++, ibl--)
                ;
        }
        outlen += ob - tbuf;
    }
    /* Count whatever is needed to return to the initial shift state. */
    ob = tbuf;
    obl = sizeof(tbuf);
    k = iconv(cd2, 0, 0, &ob, &obl);
    assert(!k);
    outlen += ob - tbuf;

    /* Convert from UTF-8 for real */
    outbuf = (char *)malloc(outlen + 1);
    if (!outbuf)
        goto fail;
    ib = utfbuf;
    ibl = utflen;
    ob = outbuf;
    obl = outlen;
    while (ibl) {
        k = iconv(cd2, &ib, &ibl, &ob, &obl);
        assert((k != (size_t)(-1) && !ibl) ||
               (k == (size_t)(-1) && errno == EILSEQ && ibl));
        /* Non-zero means inexact conversions or an error. */
        if (k && !ret)
            ret = 1;
        if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
            /* Replace one character */
            char *tb = "?";
            size_t tbl = 1;

            k = iconv(cd2, &tb, &tbl, &ob, &obl);
            assert((!k && !tbl) ||
                   (k == (size_t)(-1) && errno == EILSEQ && tbl));
            /* Must skip exactly what the counting pass skipped. */
            for (++ib, --ibl; ibl && (*ib & 0xc0) == 0x80; ib++, ibl--)
                ;
        }
    }
    k = iconv(cd2, 0, 0, &ob, &obl);
    assert(!k);
    /* The counting pass predicted the length exactly. */
    assert(!obl);
    *ob = '\0';

    free(utfbuf);
    iconv_close(cd1);
    iconv_close(cd2);
    if (tolen)
        *tolen = outlen;
    if (!to) {
        free(outbuf);
        return ret;
    }
    *to = outbuf;
    return ret;

fail:
    free(utfbuf);
    iconv_close(cd1);
    if (cd2 != (iconv_t)(-1))
        iconv_close(cd2);
    return -2;
}
#endif /* HAVE_ICONV */