Mercurial > hg > octave-shane > gnulib-hg

/* Character set conversion with error handling.
   Copyright (C) 2001-2008 Free Software Foundation, Inc.
   Written by Bruno Haible and Simon Josefsson.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include "striconveh.h"

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#if HAVE_ICONV
# include <iconv.h>
# include "unistr.h"
#endif

#include "c-strcase.h"
#include "c-strcaseeq.h"

#ifndef SIZE_MAX
# define SIZE_MAX ((size_t) -1)
#endif


#if HAVE_ICONV

/* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
   error occurs, we may have to determine the Unicode representation of the
   inconvertible character.  */

/* iconv_carefully is like iconv, except that it stops as soon as it encounters
   a conversion error, and it returns in *INCREMENTED a boolean telling whether
   it has incremented the input pointers past the error location.  */
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv and GNU libc are known to prefer to fail rather
   than doing a lossy conversion.
   Therefore, on such platforms, the input is fed to iconv() in the smallest
   pieces that iconv() accepts, and a positive return value (= number of
   irreversible conversions) is turned into an EILSEQ failure.

   On return, *INBUF and *INBYTESLEFT describe the unconsumed input.
   *OUTBUF and *OUTBYTESLEFT are advanced only past output that was produced
   by pieces for which iconv() reported no irreversible conversion.
   Return value: 0 on success, (size_t)(-1) on failure.  When iconv() itself
   failed, errno is the one it set and *INCREMENTED is false.  When a lossy
   conversion was detected, errno is EILSEQ and *INCREMENTED tells whether
   *INBUF already points past the offending input.
   Note: when *INBYTESLEFT is 0 on entry, iconv() is never called and
   (size_t)(-1) is returned without errno being set.  */
static size_t
iconv_carefully (iconv_t cd,
		 const char **inbuf, size_t *inbytesleft,
		 char **outbuf, size_t *outbytesleft,
		 bool *incremented)
{
  const char *cursor = *inbuf;
  const char *const limit = cursor + *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  const char *piece_start;
  size_t status;

  for (;;)
    {
      /* Number of input bytes offered to iconv(); grown by one as long as
	 iconv() reports the piece as incomplete.  */
      size_t piece = 1;

      piece_start = cursor;
      status = (size_t)(-1);

      while (cursor + piece <= limit)
	{
	  status = iconv (cd,
			  (ICONV_CONST char **) &cursor, &piece,
			  &dst, &dst_left);
	  if (status != (size_t)(-1) || errno != EINVAL)
	    break;
	  /* iconv can eat up a shift sequence but give EINVAL while attempting
	     to convert the first character.  E.g. libiconv does this.  */
	  if (cursor > piece_start)
	    {
	      status = 0;
	      break;
	    }
	  piece++;
	}

      if (status != 0)
	break;

      /* This piece was converted losslessly: commit its output.  */
      *outbuf = dst;
      *outbytesleft = dst_left;

      if (cursor >= limit)
	break;
    }

  *inbuf = cursor;
  *inbytesleft = limit - cursor;

  if (status == (size_t)(-1) || status == 0)
    {
      *incremented = false;
      return status;
    }

  /* iconv() has already incremented the input pointer.  We cannot go back to
     a previous position, otherwise the state inside CD would become invalid,
     if FROM_CODESET is a stateful encoding.  So, tell the caller that
     *INBUF has already been incremented.  */
  *incremented = (cursor > piece_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
/* With GNU libiconv or GNU libc, iconv_carefully is a plain iconv() call
   that additionally stores false in *INCREMENTED.  The value of the macro
   is the return value of iconv().  */
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif

/* iconv_carefully_1 is like iconv_carefully, except that it stops after
   converting one character or one shift sequence.

   It offers iconv() 1, 2, 3, ... bytes from the start of the input until
   iconv() no longer reports the piece as incomplete (EINVAL) or until the
   input is exhausted.
   On return, *INBUF and *INBYTESLEFT describe the unconsumed input.
   *OUTBUF and *OUTBYTESLEFT are updated only when the return value is not
   (size_t)(-1).  *INCREMENTED is false, except possibly in the lossy
   conversion case that is detected on non-GNU iconv() implementations.
   Note: when *INBYTESLEFT is 0 on entry, iconv() is never called and
   (size_t)(-1) is returned without errno being set.  */
static size_t
iconv_carefully_1 (iconv_t cd,
		   const char **inbuf, size_t *inbytesleft,
		   char **outbuf, size_t *outbytesleft,
		   bool *incremented)
{
  const char *const start = *inbuf;
  const char *const limit = start + *inbytesleft;
  const char *cursor = start;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  size_t status = (size_t)(-1);
  size_t piece = 1;

  while (start + piece <= limit)
    {
      cursor = start;
      status = iconv (cd,
		      (ICONV_CONST char **) &cursor, &piece,
		      &dst, &dst_left);
      if (status != (size_t)(-1) || errno != EINVAL)
	break;
      /* iconv can eat up a shift sequence but give EINVAL while attempting
	 to convert the first character.  E.g. libiconv does this.  */
      if (cursor > start)
	{
	  status = 0;
	  break;
	}
      piece++;
    }

  *inbuf = cursor;
  *inbytesleft = limit - cursor;
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
  /* Irix iconv() inserts a NUL byte if it cannot convert.
     NetBSD iconv() inserts a question mark if it cannot convert.
     Only GNU libiconv and GNU libc are known to prefer to fail rather
     than doing a lossy conversion.  */
  if (status != (size_t)(-1) && status > 0)
    {
      /* iconv() has already incremented the input pointer.  We cannot go
	 back to a previous position, otherwise the state inside CD would
	 become invalid, if FROM_CODESET is a stateful encoding.  So, tell
	 the caller that *INBUF has already been incremented.  */
      *incremented = (cursor > start);
      errno = EILSEQ;
      return (size_t)(-1);
    }
# endif

  /* Commit the output only if iconv() did not fail.  */
  if (status != (size_t)(-1))
    {
      *outbuf = dst;
      *outbytesleft = dst_left;
    }
  *incremented = false;
  return status;
}

/* utf8conv_carefully is like iconv, except that
     - it converts from UTF-8 to UTF-8,
     - it stops as soon as it encounters a conversion error, and it returns
       in *INCREMENTED a boolean telling whether it has incremented the input
       pointers past the error location,
     - if one_character_only is true, it stops after converting one
       character.
   Return value: 0 on success, (size_t)(-1) on failure with errno set to
     - EINVAL or EILSEQ if u8_mbtoucr rejected the input (the input pointer
       is then advanced by what u8_mbtouc consumes, *INCREMENTED is true),
     - E2BIG if the output buffer is too small (*INCREMENTED is false),
     - EILSEQ if u8_uctomb rejected the character (*INCREMENTED is true).
   On success, *INCREMENTED is false, like with iconv_carefully.
   *INBUF, *INBYTESLEFT, *OUTBUF, *OUTBYTESLEFT are always updated.  */
static size_t
utf8conv_carefully (bool one_character_only,
		    const char **inbuf, size_t *inbytesleft,
		    char **outbuf, size_t *outbytesleft,
		    bool *incremented)
{
  const char *inptr = *inbuf;
  size_t insize = *inbytesleft;
  char *outptr = *outbuf;
  size_t outsize = *outbytesleft;
  size_t res;

  /* Give *INCREMENTED a defined value also when no error occurs, so that
     this function agrees with iconv_carefully and iconv_carefully_1 and
     never leaves the caller's variable indeterminate.  Every error path
     below overrides it.  */
  *incremented = false;

  res = 0;
  do
    {
      ucs4_t uc;
      int n;
      int m;

      /* Decode one character, with full error reporting.  */
      n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
      if (n < 0)
	{
	  /* -2 is reported as EINVAL, any other negative value as EILSEQ.  */
	  errno = (n == -2 ? EINVAL : EILSEQ);
	  /* Ask u8_mbtouc how many bytes to skip.  */
	  n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
	  inptr += n;
	  insize -= n;
	  res = (size_t)(-1);
	  *incremented = true;
	  break;
	}
      if (outsize == 0)
	{
	  errno = E2BIG;
	  res = (size_t)(-1);
	  *incremented = false;
	  break;
	}
      /* Re-encode the character into the output buffer.  */
      m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
      if (m == -2)
	{
	  /* Not enough room: leave the input untouched.  */
	  errno = E2BIG;
	  res = (size_t)(-1);
	  *incremented = false;
	  break;
	}
      inptr += n;
      insize -= n;
      if (m == -1)
	{
	  /* The character cannot be encoded; the input was consumed.  */
	  errno = EILSEQ;
	  res = (size_t)(-1);
	  *incremented = true;
	  break;
	}
      outptr += m;
      outsize -= m;
    }
  while (!one_character_only && insize > 0);

  *inbuf = inptr;
  *inbytesleft = insize;
  *outbuf = outptr;
  *outbytesleft = outsize;
  return res;
}

/* Enlarge the output buffer of mem_cd_iconveh_internal to NEW_ALLOCATED
   bytes.  RESULT is the current buffer, of which the first LENGTH bytes are
   in use.  If RESULT is still INITIAL_RESULT (a buffer that this module must
   not realloc), a fresh block is malloc()ed and the LENGTH bytes are copied;
   otherwise RESULT is realloc()ed.
   Return the new buffer.  Upon allocation failure, free RESULT if it was
   heap-allocated here, set errno to ENOMEM and return NULL.  */
static char *
grow_result_buffer (char *result, char *initial_result,
                    size_t length, size_t new_allocated)
{
  char *memory;

  if (result == initial_result)
    memory = (char *) malloc (new_allocated);
  else
    memory = (char *) realloc (result, new_allocated);
  if (memory == NULL)
    {
      if (result != initial_result)
        free (result);
      errno = ENOMEM;
      return NULL;
    }
  if (result == initial_result)
    memcpy (memory, initial_result, length);
  return memory;
}

/* Convert the SRCLEN bytes at SRC and store the converted bytes in a buffer
   returned through *RESULTP, their number through *LENGTHP.
   CD converts directly from the source to the target encoding, CD1 converts
   from the source encoding to UTF-8, CD2 from UTF-8 to the target encoding;
   each may be (iconv_t)(-1), which for CD means "no direct conversion", for
   CD1 "the source encoding is UTF-8" and for CD2 "the target encoding is
   UTF-8".
   HANDLER selects what happens at a conversion error: with iconveh_error the
   function fails, with iconveh_escape_sequence an unconvertible character is
   replaced with a \uXXXX or \UXXXXXXXX escape, otherwise with '?'.  Invalid
   input is replaced with '?' unless HANDLER is iconveh_error.
   EXTRA_ALLOC is a number of bytes that is reserved in the returned buffer
   after the converted bytes.
   If OFFSETS is not NULL, it must have room for SRCLEN elements.  All of
   them are set to (size_t)(-1), then for each conversion step the output
   length before the step is stored at the index of the input position of
   the step.
   If *RESULTP is not NULL and *LENGTHP is at least tmpbufsize on entry, that
   buffer is used as initial output buffer; it is never realloc()ed or freed
   here.
   Return 0 if successful, otherwise -1 with errno set; in the failure case
   *RESULTP and *LENGTHP are not modified.  */
static int
mem_cd_iconveh_internal (const char *src, size_t srclen,
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
                         enum iconv_ilseq_handler handler,
                         size_t extra_alloc,
                         size_t *offsets,
                         char **resultp, size_t *lengthp)
{
  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
     Instead, we have to start afresh from the beginning of SRC.  */
  /* Use a temporary buffer, so that for small strings, a single malloc()
     call will be sufficient.  */
# define tmpbufsize 4096
  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
     libiconv's UCS-4-INTERNAL encoding.  */
  union { unsigned int align; char buf[tmpbufsize]; } tmp;
# define tmpbuf tmp.buf

  char *initial_result;
  char *result;
  size_t allocated;
  size_t length;
  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */

  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
    {
      initial_result = *resultp;
      allocated = *lengthp;
    }
  else
    {
      initial_result = tmpbuf;
      allocated = sizeof (tmpbuf);
    }
  result = initial_result;

  /* Test whether a direct conversion is possible at all.  */
  if (cd == (iconv_t)(-1))
    goto indirectly;

  if (offsets != NULL)
    {
      size_t i;

      for (i = 0; i < srclen; i++)
        offsets[i] = (size_t)(-1);

      last_length = (size_t)(-1);
    }
  length = 0;

  /* First, try a direct conversion, and see whether a conversion error
     occurs at all.  */
  {
    const char *inptr = src;
    size_t insize = srclen;

    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
    /* Set to the initial state.  */
    iconv (cd, NULL, NULL, NULL, NULL);
# endif

    while (insize > 0)
      {
        char *outptr = result + length;
        size_t outsize = allocated - extra_alloc - length;
        bool incremented;
        size_t res;
        bool grow;

        if (offsets != NULL)
          {
            if (length != last_length) /* ensure that offset[] be increasing */
              {
                offsets[inptr - src] = length;
                last_length = length;
              }
            res = iconv_carefully_1 (cd,
                                     &inptr, &insize,
                                     &outptr, &outsize,
                                     &incremented);
          }
        else
          /* Use iconv_carefully instead of iconv here, because:
             - If TO_CODESET is UTF-8, we can do the error handling in this
               loop, no need for a second loop,
             - With iconv() implementations other than GNU libiconv and GNU
               libc, if we use iconv() in a big swoop, checking for an E2BIG
               return, we lose the number of irreversible conversions.  */
          res = iconv_carefully (cd,
                                 &inptr, &insize,
                                 &outptr, &outsize,
                                 &incremented);

        length = outptr - result;
        /* Grow proactively once the buffer is more than half full.  */
        grow = (length + extra_alloc > allocated / 2);
        if (res == (size_t)(-1))
          {
            if (errno == E2BIG)
              grow = true;
            else if (errno == EINVAL)
              /* Incomplete input at the end: stop converting.  */
              break;
            else if (errno == EILSEQ && handler != iconveh_error)
              {
                if (cd2 == (iconv_t)(-1))
                  {
                    /* TO_CODESET is UTF-8.  */
                    /* Error handling can produce up to 1 byte of output.  */
                    if (length + 1 + extra_alloc > allocated)
                      {
                        allocated = 2 * allocated;
                        if (length + 1 + extra_alloc > allocated)
                          abort ();
                        result = grow_result_buffer (result, initial_result,
                                                     length, allocated);
                        if (result == NULL)
                          return -1;
                        grow = false;
                      }
                    /* The input is invalid in FROM_CODESET.  Eat up one byte
                       and emit a question mark.  */
                    if (!incremented)
                      {
                        if (insize == 0)
                          abort ();
                        inptr++;
                        insize--;
                      }
                    result[length] = '?';
                    length++;
                  }
                else
                  /* The replacement must be computed via UTF-8.  */
                  goto indirectly;
              }
            else
              {
                if (result != initial_result)
                  {
                    int saved_errno = errno;
                    free (result);
                    errno = saved_errno;
                  }
                return -1;
              }
          }
        if (insize == 0)
          break;
        if (grow)
          {
            allocated = 2 * allocated;
            result = grow_result_buffer (result, initial_result,
                                         length, allocated);
            if (result == NULL)
              return -1;
          }
      }
  }

  /* Now get the conversion state back to the initial state.
     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
  for (;;)
    {
      char *outptr = result + length;
      size_t outsize = allocated - extra_alloc - length;
      size_t res;

      res = iconv (cd, NULL, NULL, &outptr, &outsize);
      length = outptr - result;
      if (res == (size_t)(-1))
        {
          if (errno == E2BIG)
            {
              /* Not enough room for the shift sequence: enlarge, retry.  */
              allocated = 2 * allocated;
              result = grow_result_buffer (result, initial_result,
                                           length, allocated);
              if (result == NULL)
                return -1;
            }
          else
            {
              if (result != initial_result)
                {
                  int saved_errno = errno;
                  free (result);
                  errno = saved_errno;
                }
              return -1;
            }
        }
      else
        break;
    }
#endif

  /* The direct conversion succeeded.  */
  goto done;

 indirectly:
  /* The direct conversion failed.
     Use a conversion through UTF-8.  */
  if (offsets != NULL)
    {
      size_t i;

      for (i = 0; i < srclen; i++)
        offsets[i] = (size_t)(-1);

      last_length = (size_t)(-1);
    }
  length = 0;
  {
    /* Convert one character at a time in step 1 if the caller wants offsets
       or wants a conversion error to be reported.  */
    const bool slowly = (offsets != NULL || handler == iconveh_error);
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
    /* One extra byte, for a question mark appended by the error handling.  */
    char utf8buf[utf8bufsize + 1];
    size_t utf8len = 0;
    const char *in1ptr = src;
    size_t in1size = srclen;
    bool do_final_flush1 = true;
    bool do_final_flush2 = true;

    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
    /* Set to the initial state.  */
    if (cd1 != (iconv_t)(-1))
      iconv (cd1, NULL, NULL, NULL, NULL);
    if (cd2 != (iconv_t)(-1))
      iconv (cd2, NULL, NULL, NULL, NULL);
# endif

    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
      {
        char *out1ptr = utf8buf + utf8len;
        size_t out1size = utf8bufsize - utf8len;
        bool incremented1;
        size_t res1;
        int errno1;

        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
        if (in1size > 0)
          {
            if (offsets != NULL
                && length != last_length) /* ensure that offset[] be increasing */
              {
                offsets[in1ptr - src] = length;
                last_length = length;
              }
            if (cd1 != (iconv_t)(-1))
              {
                if (slowly)
                  res1 = iconv_carefully_1 (cd1,
                                            &in1ptr, &in1size,
                                            &out1ptr, &out1size,
                                            &incremented1);
                else
                  res1 = iconv_carefully (cd1,
                                          &in1ptr, &in1size,
                                          &out1ptr, &out1size,
                                          &incremented1);
              }
            else
              {
                /* FROM_CODESET is UTF-8.  */
                res1 = utf8conv_carefully (slowly,
                                           &in1ptr, &in1size,
                                           &out1ptr, &out1size,
                                           &incremented1);
              }
          }
        else if (do_final_flush1)
          {
            /* Now get the conversion state of CD1 back to the initial state.
               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
            if (cd1 != (iconv_t)(-1))
              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
            else
# endif
              res1 = 0;
            do_final_flush1 = false;
            incremented1 = true;
          }
        else
          {
            res1 = 0;
            incremented1 = true;
          }
        if (res1 == (size_t)(-1)
            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
          {
            if (result != initial_result)
              {
                int saved_errno = errno;
                free (result);
                errno = saved_errno;
              }
            return -1;
          }
        if (res1 == (size_t)(-1)
            && errno == EILSEQ && handler != iconveh_error)
          {
            /* The input is invalid in FROM_CODESET.  Eat up one byte and
               emit a question mark.  Room for the question mark was allocated
               at the end of utf8buf.  */
            if (!incremented1)
              {
                if (in1size == 0)
                  abort ();
                in1ptr++;
                in1size--;
              }
            /* Append after the bytes converted in this round, so that the
               computation of utf8len below accounts for the question mark.  */
            *out1ptr++ = '?';
            /* The error has been handled; don't fail at the end of this
               round.  */
            res1 = 0;
          }
        errno1 = errno;
        utf8len = out1ptr - utf8buf;

        if (offsets != NULL
            || in1size == 0
            || utf8len > utf8bufsize / 2
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
          {
            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
            const char *in2ptr = utf8buf;
            size_t in2size = utf8len;

            while (in2size > 0
                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
              {
                char *out2ptr = result + length;
                size_t out2size = allocated - extra_alloc - length;
                bool incremented2;
                size_t res2;
                bool grow;

                if (in2size > 0)
                  {
                    if (cd2 != (iconv_t)(-1))
                      res2 = iconv_carefully (cd2,
                                              &in2ptr, &in2size,
                                              &out2ptr, &out2size,
                                              &incremented2);
                    else
                      /* TO_CODESET is UTF-8.  */
                      res2 = utf8conv_carefully (false,
                                                 &in2ptr, &in2size,
                                                 &out2ptr, &out2size,
                                                 &incremented2);
                  }
                else /* in1size == 0 && !do_final_flush1
                        && in2size == 0 && do_final_flush2 */
                  {
                    /* Now get the conversion state of CD2 back to the initial
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
                    if (cd2 != (iconv_t)(-1))
                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
                    else
# endif
                      res2 = 0;
                    do_final_flush2 = false;
                    incremented2 = true;
                  }

                length = out2ptr - result;
                grow = (length + extra_alloc > allocated / 2);
                if (res2 == (size_t)(-1))
                  {
                    if (errno == E2BIG)
                      grow = true;
                    else if (errno == EINVAL)
                      break;
                    else if (errno == EILSEQ && handler != iconveh_error)
                      {
                        /* Error handling can produce up to 10 bytes of ASCII
                           output.  But TO_CODESET may be UCS-2, UTF-16 or
                           UCS-4, so use CD2 here as well.  */
                        char scratchbuf[10];
                        size_t scratchlen;
                        ucs4_t uc;
                        const char *inptr;
                        size_t insize;
                        size_t res;

                        /* Determine the unconvertible character UC.  */
                        if (incremented2)
                          {
                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
                                         (const uint8_t *) utf8buf)
                                == NULL)
                              abort ();
                          }
                        else
                          {
                            int n;
                            if (in2size == 0)
                              abort ();
                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
                                                  in2size);
                            in2ptr += n;
                            in2size -= n;
                          }

                        /* Build the ASCII replacement in scratchbuf.  */
                        if (handler == iconveh_escape_sequence)
                          {
                            static const char hex[16] = "0123456789ABCDEF";
                            scratchlen = 0;
                            scratchbuf[scratchlen++] = '\\';
                            if (uc < 0x10000)
                              scratchbuf[scratchlen++] = 'u';
                            else
                              {
                                scratchbuf[scratchlen++] = 'U';
                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
                              }
                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
                            scratchbuf[scratchlen++] = hex[uc & 15];
                          }
                        else
                          {
                            scratchbuf[0] = '?';
                            scratchlen = 1;
                          }

                        /* Convert the replacement to TO_CODESET.  */
                        inptr = scratchbuf;
                        insize = scratchlen;
                        if (cd2 != (iconv_t)(-1))
                          res = iconv (cd2,
                                       (ICONV_CONST char **) &inptr, &insize,
                                       &out2ptr, &out2size);
                        else
                          {
                            /* TO_CODESET is UTF-8.  */
                            if (out2size >= insize)
                              {
                                memcpy (out2ptr, inptr, insize);
                                out2ptr += insize;
                                out2size -= insize;
                                inptr += insize;
                                insize = 0;
                                res = 0;
                              }
                            else
                              {
                                errno = E2BIG;
                                res = (size_t)(-1);
                              }
                          }
                        length = out2ptr - result;
                        if (res == (size_t)(-1) && errno == E2BIG)
                          {
                            /* Enlarge the buffer and retry once.  */
                            allocated = 2 * allocated;
                            if (length + 1 + extra_alloc > allocated)
                              abort ();
                            result = grow_result_buffer (result,
                                                         initial_result,
                                                         length, allocated);
                            if (result == NULL)
                              return -1;
                            grow = false;

                            out2ptr = result + length;
                            out2size = allocated - extra_alloc - length;
                            if (cd2 != (iconv_t)(-1))
                              res = iconv (cd2,
                                           (ICONV_CONST char **) &inptr,
                                           &insize,
                                           &out2ptr, &out2size);
                            else
                              {
                                /* TO_CODESET is UTF-8.  */
                                if (!(out2size >= insize))
                                  abort ();
                                memcpy (out2ptr, inptr, insize);
                                out2ptr += insize;
                                out2size -= insize;
                                inptr += insize;
                                insize = 0;
                                res = 0;
                              }
                            length = out2ptr - result;
                          }
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
                        /* Irix iconv() inserts a NUL byte if it cannot convert.
                           NetBSD iconv() inserts a question mark if it cannot
                           convert.
                           Only GNU libiconv and GNU libc are known to prefer
                           to fail rather than doing a lossy conversion.  */
                        if (res != (size_t)(-1) && res > 0)
                          {
                            errno = EILSEQ;
                            res = (size_t)(-1);
                          }
# endif
                        if (res == (size_t)(-1))
                          {
                            /* Failure converting the ASCII replacement.  */
                            if (result != initial_result)
                              {
                                int saved_errno = errno;
                                free (result);
                                errno = saved_errno;
                              }
                            return -1;
                          }
                      }
                    else
                      {
                        if (result != initial_result)
                          {
                            int saved_errno = errno;
                            free (result);
                            errno = saved_errno;
                          }
                        return -1;
                      }
                  }
                if (!(in2size > 0
                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
                  break;
                if (grow)
                  {
                    allocated = 2 * allocated;
                    result = grow_result_buffer (result, initial_result,
                                                 length, allocated);
                    if (result == NULL)
                      return -1;
                  }
              }

            /* Move the remaining bytes to the beginning of utf8buf.  */
            if (in2size > 0)
              memmove (utf8buf, in2ptr, in2size);
            utf8len = in2size;
          }

        if (res1 == (size_t)(-1))
          {
            if (errno1 == EINVAL)
              /* Incomplete input at the end: ignore the rest of it.  */
              in1size = 0;
            else if (errno1 == EILSEQ)
              {
                /* Only reached when HANDLER is iconveh_error.  */
                if (result != initial_result)
                  free (result);
                errno = errno1;
                return -1;
              }
          }
      }
# undef utf8bufsize
  }

 done:
  /* Now the final memory allocation.  */
  if (result == tmpbuf)
    {
      /* The result is still in the stack buffer: copy it to the heap.  */
      size_t memsize = length + extra_alloc;
      char *memory;

      memory = (char *) malloc (memsize > 0 ? memsize : 1);
      if (memory != NULL)
        {
          memcpy (memory, tmpbuf, length);
          result = memory;
        }
      else
        {
          errno = ENOMEM;
          return -1;
        }
    }
  else if (result != *resultp && length + extra_alloc < allocated)
    {
      /* Shrink the allocated memory if possible.  */
      size_t memsize = length + extra_alloc;
      char *memory;

      memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
      if (memory != NULL)
        result = memory;
    }
  *resultp = result;
  *lengthp = length;
  return 0;
# undef tmpbuf
# undef tmpbufsize
}

/* Convert the SRCLEN bytes at SRC using the conversion descriptors CD, CD1,
   CD2 and the error handling method HANDLER.  This simply forwards to
   mem_cd_iconveh_internal, requesting no extra room after the converted
   bytes, and returns its return value.  */
int
mem_cd_iconveh (const char *src, size_t srclen,
		iconv_t cd, iconv_t cd1, iconv_t cd2,
		enum iconv_ilseq_handler handler,
		size_t *offsets,
		char **resultp, size_t *lengthp)
{
  /* No bytes are reserved beyond the converted data.  */
  const size_t no_extra_alloc = 0;
  int status;

  status = mem_cd_iconveh_internal (src, srclen,
				    cd, cd1, cd2, handler,
				    no_extra_alloc,
				    offsets, resultp, lengthp);
  return status;
}

/* Convert the NUL terminated string SRC using the conversion descriptors CD,
   CD1, CD2 and the error handling method HANDLER.
   Return a freshly allocated NUL terminated string, or NULL with errno set
   if the conversion failed.  */
char *
str_cd_iconveh (const char *src,
		iconv_t cd, iconv_t cd1, iconv_t cd2,
		enum iconv_ilseq_handler handler)
{
  /* For most encodings, a trailing NUL byte in the input will be converted
     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
     function is usable for UTF-7, we have to exclude the NUL byte from the
     conversion and add it by hand afterwards.  That is why one extra byte
     of room is requested from mem_cd_iconveh_internal.  */
  char *converted = NULL;
  size_t converted_len = 0;

  if (mem_cd_iconveh_internal (src, strlen (src),
			       cd, cd1, cd2, handler, 1, NULL,
			       &converted, &converted_len)
      >= 0)
    {
      /* Add the terminating NUL byte.  */
      converted[converted_len] = '\0';
      return converted;
    }

  /* Failure.  Release whatever was stored, keeping the errno of the
     conversion.  */
  if (converted != NULL)
    {
      int saved_errno = errno;
      free (converted);
      errno = saved_errno;
    }
  return NULL;
}

#endif

/* Convert the SRCLEN bytes at SRC from the encoding FROM_CODESET to the
   encoding TO_CODESET, with the error handling method HANDLER.
   OFFSETS, *RESULTP and *LENGTHP are passed on to mem_cd_iconveh.
   Return 0 if successful, otherwise -1 with errno set.
   Special cases:
     - If SRCLEN is 0, only *LENGTHP is set (to 0); *RESULTP is left as is.
     - If OFFSETS is NULL and the two encoding names are equal ignoring case,
       the bytes are copied, into *RESULTP if that is non-NULL and *LENGTHP
       is at least SRCLEN, otherwise into freshly malloc()ed memory.
     - If iconv is not available, the function fails with errno = ENOSYS.  */
int
mem_iconveh (const char *src, size_t srclen,
	     const char *from_codeset, const char *to_codeset,
	     enum iconv_ilseq_handler handler,
	     size_t *offsets,
	     char **resultp, size_t *lengthp)
{
  if (srclen == 0)
    {
      /* Nothing to convert.  */
      *lengthp = 0;
      return 0;
    }

  if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
    {
      /* Same encoding on both sides: a plain copy suffices.  */
      char *copy = *resultp;

      if (copy == NULL || *lengthp < srclen)
	{
	  copy = (char *) malloc (srclen);
	  if (copy == NULL)
	    {
	      errno = ENOMEM;
	      return -1;
	    }
	}
      memcpy (copy, src, srclen);
      *resultp = copy;
      *lengthp = srclen;
      return 0;
    }

#if HAVE_ICONV
  {
    iconv_t cd;
    iconv_t cd1;
    iconv_t cd2;
    iconv_t closing_order[3];
    char *converted;
    size_t converted_len;
    int status;
    int kept_errno;
    bool close_failed = false;
    size_t k;

    /* Avoid glibc-2.1 bug with EUC-KR.  */
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
    if (c_strcasecmp (from_codeset, "EUC-KR") == 0
	|| c_strcasecmp (to_codeset, "EUC-KR") == 0)
      {
	errno = EINVAL;
	return -1;
      }
# endif

    /* The direct conversion; it is allowed to be unavailable.  */
    cd = iconv_open (to_codeset, from_codeset);

    /* The conversion from FROM_CODESET to UTF-8, unless trivial.  */
    if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
      cd1 = (iconv_t)(-1);
    else
      {
	cd1 = iconv_open ("UTF-8", from_codeset);
	if (cd1 == (iconv_t)(-1))
	  {
	    int saved_errno = errno;
	    if (cd != (iconv_t)(-1))
	      iconv_close (cd);
	    errno = saved_errno;
	    return -1;
	  }
      }

    /* The conversion from UTF-8 to TO_CODESET, unless trivial.  */
    if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
	|| c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
# endif
       )
      cd2 = (iconv_t)(-1);
    else
      {
	cd2 = iconv_open (to_codeset, "UTF-8");
	if (cd2 == (iconv_t)(-1))
	  {
	    int saved_errno = errno;
	    if (cd1 != (iconv_t)(-1))
	      iconv_close (cd1);
	    if (cd != (iconv_t)(-1))
	      iconv_close (cd);
	    errno = saved_errno;
	    return -1;
	  }
      }

    converted = *resultp;
    converted_len = *lengthp;
    status = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
			     &converted, &converted_len);

    /* Close cd2, cd1, cd, in this order.  If the conversion failed, its
       errno is the one to report.  Otherwise the errno of the first failing
       iconv_close is the one to report; the remaining descriptors are
       closed nevertheless.  */
    kept_errno = errno;
    closing_order[0] = cd2;
    closing_order[1] = cd1;
    closing_order[2] = cd;
    for (k = 0; k < 3; k++)
      {
	if (closing_order[k] == (iconv_t)(-1))
	  continue;
	if (iconv_close (closing_order[k]) < 0
	    && status >= 0 && !close_failed)
	  {
	    kept_errno = errno;
	    close_failed = true;
	  }
      }

    if (status < 0)
      {
	/* Preserve the errno from mem_cd_iconveh.  */
	errno = kept_errno;
	return status;
      }

    if (close_failed)
      {
	/* Return -1, but free the allocated memory, and while doing
	   that, preserve the errno from iconv_close.  */
	if (converted != *resultp && converted != NULL)
	  free (converted);
	errno = kept_errno;
	return -1;
      }

    *resultp = converted;
    *lengthp = converted_len;
    return status;
  }
#else
  /* This is a different error code than if iconv_open existed but didn't
     support from_codeset and to_codeset, so that the caller can emit
     an error message such as
       "iconv() is not supported. Installing GNU libiconv and
	then reinstalling this package would fix this."  */
  errno = ENOSYS;
  return -1;
#endif
}

/* Convert the NUL terminated string SRC from the encoding FROM_CODESET to
   the encoding TO_CODESET, with the error handling method HANDLER.
   Return a freshly allocated NUL terminated string, or NULL with errno set
   upon failure.
   If SRC is empty or the two encoding names are equal ignoring case, the
   result is a copy of SRC made with strdup.  If iconv is not available,
   any other request fails with errno = ENOSYS.  */
char *
str_iconveh (const char *src,
	     const char *from_codeset, const char *to_codeset,
	     enum iconv_ilseq_handler handler)
{
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
    {
      /* No conversion needed.  */
      char *copy = strdup (src);

      if (copy != NULL)
	return copy;
      errno = ENOMEM;
      return NULL;
    }

#if HAVE_ICONV
  {
    iconv_t cd;
    iconv_t cd1;
    iconv_t cd2;
    iconv_t closing_order[3];
    char *converted;
    int kept_errno;
    bool close_failed = false;
    size_t k;

    /* Avoid glibc-2.1 bug with EUC-KR.  */
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
    if (c_strcasecmp (from_codeset, "EUC-KR") == 0
	|| c_strcasecmp (to_codeset, "EUC-KR") == 0)
      {
	errno = EINVAL;
	return NULL;
      }
# endif

    /* The direct conversion; it is allowed to be unavailable.  */
    cd = iconv_open (to_codeset, from_codeset);

    /* The conversion from FROM_CODESET to UTF-8, unless trivial.  */
    if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
      cd1 = (iconv_t)(-1);
    else
      {
	cd1 = iconv_open ("UTF-8", from_codeset);
	if (cd1 == (iconv_t)(-1))
	  {
	    int saved_errno = errno;
	    if (cd != (iconv_t)(-1))
	      iconv_close (cd);
	    errno = saved_errno;
	    return NULL;
	  }
      }

    /* The conversion from UTF-8 to TO_CODESET, unless trivial.  */
    if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
	|| c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
# endif
       )
      cd2 = (iconv_t)(-1);
    else
      {
	cd2 = iconv_open (to_codeset, "UTF-8");
	if (cd2 == (iconv_t)(-1))
	  {
	    int saved_errno = errno;
	    if (cd1 != (iconv_t)(-1))
	      iconv_close (cd1);
	    if (cd != (iconv_t)(-1))
	      iconv_close (cd);
	    errno = saved_errno;
	    return NULL;
	  }
      }

    converted = str_cd_iconveh (src, cd, cd1, cd2, handler);

    /* Close cd2, cd1, cd, in this order.  If the conversion failed, its
       errno is the one to report.  Otherwise the errno of the first failing
       iconv_close is the one to report; the remaining descriptors are
       closed nevertheless.  */
    kept_errno = errno;
    closing_order[0] = cd2;
    closing_order[1] = cd1;
    closing_order[2] = cd;
    for (k = 0; k < 3; k++)
      {
	if (closing_order[k] == (iconv_t)(-1))
	  continue;
	if (iconv_close (closing_order[k]) < 0
	    && converted != NULL && !close_failed)
	  {
	    kept_errno = errno;
	    close_failed = true;
	  }
      }

    if (converted == NULL)
      {
	/* Preserve the errno from str_cd_iconveh.  */
	errno = kept_errno;
	return NULL;
      }

    if (close_failed)
      {
	/* Return NULL, but free the allocated memory, and while doing
	   that, preserve the errno from iconv_close.  */
	free (converted);
	errno = kept_errno;
	return NULL;
      }

    return converted;
  }
#else
  /* This is a different error code than if iconv_open existed but didn't
     support from_codeset and to_codeset, so that the caller can emit
     an error message such as
       "iconv() is not supported. Installing GNU libiconv and
	then reinstalling this package would fix this."  */
  errno = ENOSYS;
  return NULL;
#endif
}
author	Bruno Haible <bruno@clisp.org>
date	Sun, 19 Oct 2008 03:36:00 +0200
parents	5991da96e696
children	ba1f5a03459a