Mercurial > hg > octave-kai > gnulib-hg
view lib/mbrtowc.c @ 16366:bb182ee4a09d
maint: replace FSF snail-mail addresses with URLs
* config/argz.mk, lib/accept4.c, lib/alignof.h, lib/alloca.in.h:
* lib/alphasort.c, lib/arcfour.c, lib/arcfour.h, lib/arctwo.c:
* lib/arctwo.h, lib/argz.c, lib/arpa_inet.in.h, lib/asnprintf.c:
* lib/asprintf.c, lib/assert.in.h, lib/base32.c, lib/base32.h:
* lib/base64.c, lib/base64.h, lib/c-ctype.c, lib/c-ctype.h:
* lib/c-strcase.h, lib/c-strcasecmp.c, lib/c-strncasecmp.c:
* lib/check-version.c, lib/check-version.h, lib/config.charset:
* lib/ctype.in.h, lib/des.c, lib/des.h, lib/dup3.c, lib/errno.in.h:
* lib/float+.h, lib/fnmatch.c, lib/fnmatch.in.h, lib/fnmatch_loop.c:
* lib/fseeko.c, lib/gai_strerror.c, lib/gc-gnulib.c:
* lib/gc-libgcrypt.c, lib/gc-pbkdf2-sha1.c, lib/gc.h:
* lib/getaddrinfo.c, lib/getdelim.c, lib/getfilecon.c, lib/getline.c:
* lib/getlogin_r.c, lib/getpass.c, lib/getpass.h, lib/gettext.h:
* lib/gettimeofday.c, lib/glob.in.h, lib/glthread/cond.c:
* lib/glthread/cond.h, lib/glthread/lock.c, lib/glthread/lock.h:
* lib/glthread/thread.c, lib/glthread/thread.h:
* lib/glthread/threadlib.c, lib/glthread/yield.h, lib/hmac-md5.c:
* lib/hmac-sha1.c, lib/hmac.h, lib/iconv.c, lib/iconv.in.h:
* lib/iconv_close.c, lib/iconv_open.c, lib/inet_ntop.c, lib/isfinite.c:
* lib/isinf.c, lib/iswblank.c, lib/langinfo.in.h, lib/link.c:
* lib/localcharset.c, lib/localcharset.h, lib/lseek.c, lib/malloc.c:
* lib/malloca.c, lib/malloca.h, lib/md2.c, lib/md2.h, lib/md4.c:
* lib/md4.h, lib/md5.c, lib/md5.h, lib/memmem.c, lib/mempcpy.c:
* lib/memset.c, lib/memxor.c, lib/memxor.h, lib/minmax.h, lib/mktime.c:
* lib/msvc-inval.c, lib/msvc-inval.h, lib/msvc-nothrow.c:
* lib/msvc-nothrow.h, lib/netdb.in.h, lib/netinet_in.in.h, lib/nproc.c:
* lib/nproc.h, lib/obstack_printf.c, lib/pathmax.h, lib/pipe.c:
* lib/pipe2.c, lib/poll.c, lib/poll.in.h, lib/printf-args.c:
* lib/printf-args.h, lib/printf-parse.c, lib/printf-parse.h:
* lib/pselect.c, lib/pthread.in.h, lib/pty-private.h, lib/pty.in.h:
* lib/read-file.c, lib/read-file.h, lib/ref-add.sin, lib/ref-del.sin:
* lib/regcomp.c, lib/regex.c, lib/regex.h, lib/regex_internal.c:
* lib/regex_internal.h, lib/regexec.c, lib/rijndael-alg-fst.c:
* lib/rijndael-alg-fst.h, lib/rijndael-api-fst.c:
* lib/rijndael-api-fst.h, lib/rint.c, lib/rintf.c, lib/rintl.c:
* lib/round.c, lib/roundf.c, lib/roundl.c, lib/scandir.c, lib/select.c:
* lib/sha1.c, lib/sha1.h, lib/size_max.h, lib/snprintf.c:
* lib/stdalign.in.h, lib/stdarg.in.h, lib/stdbool.in.h:
* lib/stddef.in.h, lib/stdint.in.h, lib/stdio.in.h, lib/str-kmp.h:
* lib/str-two-way.h, lib/strcasecmp.c, lib/strcasestr.c, lib/strdup.c:
* lib/striconv.c, lib/striconv.h, lib/string.in.h, lib/strings.in.h:
* lib/strncasecmp.c, lib/strndup.c, lib/strnlen.c, lib/strpbrk.c:
* lib/strptime.c, lib/strsep.c, lib/strstr.c, lib/strverscmp.c:
* lib/sys_file.in.h, lib/sys_ioctl.in.h, lib/sys_select.in.h:
* lib/sys_socket.in.h, lib/sys_stat.in.h, lib/sys_time.in.h:
* lib/sys_times.in.h, lib/sys_types.in.h, lib/sys_uio.in.h:
* lib/sys_utsname.in.h, lib/sys_wait.in.h, lib/tcgetsid.c:
* lib/termios.in.h, lib/time.in.h, lib/time_r.c, lib/timegm.c:
* lib/times.c, lib/unictype/3level.h, lib/unictype/3levelbit.h:
* lib/unistd.in.h, lib/vasnprintf.c, lib/vasnprintf.h, lib/vasprintf.c:
* lib/vsnprintf.c, lib/waitpid.c, lib/wchar.in.h, lib/wctype.in.h:
* lib/xsize.h, tests/test-closein.c, tests/test-des.c:
* tests/test-fclose.c, tests/test-fgetc.c, tests/test-filevercmp.c:
* tests/test-fputc.c, tests/test-fread.c, tests/test-fwrite.c:
* tests/test-gc-arcfour.c, tests/test-gc-arctwo.c, tests/test-gc-des.c:
* tests/test-gc-hmac-md5.c, tests/test-gc-hmac-sha1.c:
* tests/test-gc-md2.c, tests/test-gc-md4.c, tests/test-gc-md5.c:
* tests/test-gc-pbkdf2-sha1.c, tests/test-gc-rijndael.c:
* tests/test-gc-sha1.c, tests/test-gc.c, tests/test-getdelim.c:
* tests/test-getline.c, tests/test-getndelim2.c, tests/test-md2.c:
* tests/test-md4.c, tests/test-parse-datetime.c, tests/test-perror.c:
* tests/test-perror2.c, tests/test-pipe.c, tests/test-pipe2.c:
* tests/test-poll.c, tests/test-quotearg-simple.c:
* tests/test-quotearg.c, tests/test-quotearg.h:
* tests/test-round-ieee.c, tests/test-round1.c:
* tests/test-roundf-ieee.c, tests/test-roundf1.c:
* tests/test-roundl-ieee.c, tests/test-roundl.c:
* tests/test-safe-alloc.c, tests/test-sigpipe.c:
* tests/test-spawn-pipe-child.c, tests/test-spawn-pipe-main.c:
* tests/test-strerror.c, tests/test-strerror_r.c:
* tests/test-strsignal.c, tests/test-strverscmp.c:
* tests/test-xmemdup0.c:
Replace FSF snail mail addresses with URLs, as per GNU coding
standards. See glibc bug
<http://sourceware.org/bugzilla/show_bug.cgi?id=13673>.
author | Paul Eggert <eggert@cs.ucla.edu> |
---|---|
date | Thu, 09 Feb 2012 21:39:05 -0800 |
parents | 8250f2777afc |
children | a9e289a3a38d |
line wrap: on
line source
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2012 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include <wchar.h>

#if GNULIB_defined_mbstate_t
/* Implement mbrtowc() on top of mbtowc().  */

# include <errno.h>
# include <stdlib.h>

# include "localcharset.h"
# include "streq.h"
# include "verify.h"

/* The conversion state is accessed below as an array of 4 chars:
     byte 0      number of bytes of an incomplete character saved so far
                 (0 to 3),
     bytes 1..3  the saved bytes themselves.
   Make sure mbstate_t is large enough to hold that.  */
verify (sizeof (mbstate_t) >= 4);

/* State used when the caller passes ps == NULL.  */
static char internal_state[4];

/* Convert the multibyte character that starts at S (at most N bytes are
   available) to a wide character, continuing from the state in *PS.

   PWC  where to store the wide character, or NULL to discard it.
   S    the input bytes; if NULL, the call behaves as if PWC were NULL,
        S were "" and N were 1.
   N    number of input bytes available.
   PS   conversion state, or NULL to use the static internal_state.

   Return value:
     - the result of mbtowc() minus the number of bytes that were already
       saved in the state, i.e. the number of bytes of S that belong to the
       converted character (0 for the null character); the state is reset;
     - (size_t)(-2) if N is 0, or if the available bytes form only the
       beginning of a character; in the latter case the bytes are saved in
       the state;
     - (size_t)(-1) with errno set to EILSEQ if the bytes are invalid, or
       with errno set to EINVAL if the state's byte count is not 0..3.

   Calls abort() if the result of mbtowc() contradicts the invariants
   checked below.  */
size_t
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  char *pstate = (char *)ps;

  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }

  if (n == 0)
    return (size_t)(-2);

  /* Here n > 0.  */

  if (pstate == NULL)
    pstate = internal_state;

  {
    /* Number of bytes saved in the state by a previous call.  */
    size_t nstate = pstate[0];
    /* Scratch buffer: the saved bytes followed by new bytes from s.  */
    char buf[4];
    /* p[0..m-1] are the bytes handed to mbtowc().  */
    const char *p;
    size_t m;

    switch (nstate)
      {
      case 0:
        /* Nothing saved: convert directly from the caller's buffer.  */
        p = s;
        m = n;
        break;
      /* Cases 3, 2, 1 fall through so that exactly the saved bytes
         pstate[1..nstate] are copied to buf[0..nstate-1].  */
      case 3:
        buf[2] = pstate[3];
        /*FALLTHROUGH*/
      case 2:
        buf[1] = pstate[2];
        /*FALLTHROUGH*/
      case 1:
        buf[0] = pstate[1];
        p = buf;
        m = nstate;
        /* Append new bytes from s: at least one (n > 0), and more while
           both s and the 4-byte buffer have room.  */
        buf[m++] = s[0];
        if (n >= 2 && m < 4)
          {
            buf[m++] = s[1];
            if (n >= 3 && m < 4)
              buf[m++] = s[2];
          }
        break;
      default:
        /* The byte count in the state is not one this function stores.  */
        errno = EINVAL;
        return (size_t)(-1);
      }

    /* Here m > 0.  */

# if __GLIBC__ || defined __UCLIBC__
    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
    mbtowc (NULL, NULL, 0);
# endif
    {
      int res = mbtowc (pwc, p, m);

      if (res >= 0)
        {
          /* mbtowc() must return 0 exactly when it stored a null wide
             character.  */
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
            abort ();
          /* The converted character must extend beyond the bytes that were
             already saved in the state.  */
          if (nstate >= (res > 0 ? res : 1))
            abort ();
          /* Report only the bytes taken from s in this call.  */
          res -= nstate;
          pstate[0] = 0;
          return res;
        }

      /* mbtowc does not distinguish between invalid and incomplete multibyte
         sequences.  But mbrtowc needs to make this distinction.
         There are two possible approaches:
           - Use iconv() and its return value.
           - Use built-in knowledge about the possible encodings.
         Given the low quality of implementation of iconv() on the systems that
         lack mbrtowc(), we use the second approach.
         The possible encodings are:
           - 8-bit encodings,
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
           - UTF-8.
         Use specialized code for each.  */
      /* With 4 or more bytes, or at least MB_CUR_MAX bytes, the failure is
         reported as an invalid sequence without looking at the encoding.  */
      if (m >= 4 || m >= MB_CUR_MAX)
        goto invalid;
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
      {
        const char *encoding = locale_charset ();

        /* Each branch below jumps to 'incomplete' when p[0..m-1] is accepted
           as the beginning of a longer character in that encoding, and to
           'invalid' otherwise.  */
        if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
          {
            /* Cf. unistr/u8-mblen.c.  */
            unsigned char c = (unsigned char) p[0];

            if (c >= 0xc2)
              {
                if (c < 0xe0)
                  {
                    /* Lead byte 0xc2..0xdf.  */
                    if (m == 1)
                      goto incomplete;
                  }
                else if (c < 0xf0)
                  {
                    /* Lead byte 0xe0..0xef.  */
                    if (m == 1)
                      goto incomplete;
                    if (m == 2)
                      {
                        unsigned char c2 = (unsigned char) p[1];

                        /* (c2 ^ 0x80) < 0x40 holds exactly when c2 is in
                           0x80..0xbf.  */
                        if ((c2 ^ 0x80) < 0x40
                            && (c >= 0xe1 || c2 >= 0xa0)
                            && (c != 0xed || c2 < 0xa0))
                          goto incomplete;
                      }
                  }
                else if (c <= 0xf4)
                  {
                    /* Lead byte 0xf0..0xf4.  */
                    if (m == 1)
                      goto incomplete;
                    else /* m == 2 || m == 3 */
                      {
                        unsigned char c2 = (unsigned char) p[1];

                        if ((c2 ^ 0x80) < 0x40
                            && (c >= 0xf1 || c2 >= 0x90)
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
                          {
                            if (m == 2)
                              goto incomplete;
                            else /* m == 3 */
                              {
                                unsigned char c3 = (unsigned char) p[2];

                                if ((c3 ^ 0x80) < 0x40)
                                  goto incomplete;
                              }
                          }
                      }
                  }
              }
            goto invalid;
          }

        /* As a reference for this code, you can use the GNU libiconv
           implementation.  Look for uses of the RET_TOOFEW macro.  */

        if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
                  goto incomplete;
              }
            if (m == 2)
              {
                unsigned char c = (unsigned char) p[0];

                /* Only a first byte of 0x8f can still be incomplete after
                   two bytes.  */
                if (c == 0x8f)
                  {
                    unsigned char c2 = (unsigned char) p[1];

                    if (c2 >= 0xa1 && c2 < 0xff)
                      goto incomplete;
                  }
              }
            goto invalid;
          }
        if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
            || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
            || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if (c >= 0xa1 && c < 0xff)
                  goto incomplete;
              }
            goto invalid;
          }
        if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
                  goto incomplete;
              }
            else /* m == 2 || m == 3 */
              {
                unsigned char c = (unsigned char) p[0];

                if (c == 0x8e)
                  goto incomplete;
              }
            goto invalid;
          }
        if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
                  goto incomplete;
              }
            else /* m == 2 || m == 3 */
              {
                unsigned char c = (unsigned char) p[0];

                if (c >= 0x90 && c <= 0xe3)
                  {
                    unsigned char c2 = (unsigned char) p[1];

                    if (c2 >= 0x30 && c2 <= 0x39)
                      {
                        if (m == 2)
                          goto incomplete;
                        else /* m == 3 */
                          {
                            unsigned char c3 = (unsigned char) p[2];

                            if (c3 >= 0x81 && c3 <= 0xfe)
                              goto incomplete;
                          }
                      }
                  }
              }
            goto invalid;
          }
        if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
                    || (c >= 0xf0 && c <= 0xf9))
                  goto incomplete;
              }
            goto invalid;
          }

        /* An unknown multibyte encoding.  */
        goto incomplete;
      }

     incomplete:
      /* Save the bytes taken from s behind the bytes that are already in
         the state, so that pstate[1..m] holds all m bytes seen so far.  */
      {
        size_t k = nstate;
        /* Here 0 <= k < m < 4.  */
        pstate[++k] = s[0];
        if (k < m)
          {
            pstate[++k] = s[1];
            if (k < m)
              pstate[++k] = s[2];
          }
        if (k != m)
          abort ();
      }
      pstate[0] = m;
      return (size_t)(-2);

     invalid:
      errno = EILSEQ;
      /* The conversion state is undefined, says POSIX.  */
      return (size_t)(-1);
    }
  }
}

#else
/* Override the system's mbrtowc() function.  */

/* NOTE(review): presumably <wchar.h> defines mbrtowc as a macro for
   rpl_mbrtowc; undefining it makes the calls below reach the system's
   function -- confirm against wchar.in.h.  */
# undef mbrtowc

/* Wrapper around the system's mbrtowc() with the same parameters and return
   value convention.  Each MBRTOWC_*_BUG macro enables one adjustment:
     MBRTOWC_NULL_ARG2_BUG   a NULL S is replaced by PWC = NULL, S = "",
                             N = 1 (also done for MBRTOWC_RETVAL_BUG);
     MBRTOWC_RETVAL_BUG      a character begun in an earlier call is completed
                             one byte at a time and the byte count is computed
                             here;
     MBRTOWC_NUL_RETVAL_BUG  the result is forced to 0 when the converted
                             character is the null wide character;
     MBRTOWC_NULL_ARG1_BUG   a NULL PWC is replaced by the address of a dummy
                             variable.
   NOTE(review): these macros are presumably set by configure-time tests;
   they are not defined in this file.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        /* count is the number of bytes of s fed to mbrtowc() so far.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All n bytes were consumed without completing the character.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Convert into a local variable so that the result can be inspected
       even when pwc is NULL.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  {
# if MBRTOWC_NULL_ARG1_BUG
    wchar_t dummy;

    if (pwc == NULL)
      pwc = &dummy;
# endif

    return mbrtowc (pwc, s, n, ps);
  }
# endif
}

#endif