diff lib/unistr/u8-mbtouc-unsafe.c @ 13864:83949b090c98

unistr/u8-mbtouc: Improve handling of ill-formed UTF-8 input. * lib/unistr/u8-mbtouc.c (u8_mbtouc): For an invalid multibyte character, return the number of bytes that belong together, not always 1. * lib/unistr/u8-mbtouc-unsafe.c (u8_mbtouc_unsafe): Likewise. * lib/unistr/u8-mbtouc-aux.c (u8_mbtouc_aux): Likewise. * lib/unistr/u8-mbtouc-unsafe-aux.c (u8_mbtouc_unsafe_aux): Likewise. * lib/unistr/u8-mbsnlen.c (u8_mbsnlen): Use u8_mbtouc to determine the number of bytes of an invalid character. * tests/unistr/test-u8-mbtouc.c (test_safe_function): New function. (main): Invoke it. * tests/unistr/test-u8-mbtouc.h (test_function): Update two test results. * tests/unistr/test-u8-mbsnlen.c (main): Test various kinds of malformed byte sequences. * modules/unistr/u8-mbtouc (configure.ac): Bump version number. * modules/unistr/u8-mbtouc-unsafe (configure.ac): Likewise. * modules/unistr/u8-mbsnlen (configure.ac): Likewise. Reported by Ben Pfaff and Paolo Bonzini.
author Bruno Haible <bruno@clisp.org>
date Sat, 13 Nov 2010 19:43:06 +0100
parents c2cbabec01dd
children 97fc9a21a8fb
line wrap: on
line diff
--- a/lib/unistr/u8-mbtouc-unsafe.c
+++ b/lib/unistr/u8-mbtouc-unsafe.c
@@ -52,13 +52,15 @@
                          | (unsigned int) (s[1] ^ 0x80);
                   return 2;
                 }
+#if CONFIG_UNICODE_SAFETY
               /* invalid multibyte character */
+#endif
             }
           else
             {
               /* incomplete multibyte character */
               *puc = 0xfffd;
-              return n;
+              return 1;
             }
         }
       else if (c < 0xf0)
@@ -66,23 +68,39 @@
           if (n >= 3)
             {
 #if CONFIG_UNICODE_SAFETY
-              if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
-                  && (c >= 0xe1 || s[1] >= 0xa0)
-                  && (c != 0xed || s[1] < 0xa0))
-#endif
+              if ((s[1] ^ 0x80) < 0x40)
                 {
-                  *puc = ((unsigned int) (c & 0x0f) << 12)
-                         | ((unsigned int) (s[1] ^ 0x80) << 6)
-                         | (unsigned int) (s[2] ^ 0x80);
-                  return 3;
+                  if ((s[2] ^ 0x80) < 0x40)
+                    {
+                      if ((c >= 0xe1 || s[1] >= 0xa0)
+                          && (c != 0xed || s[1] < 0xa0))
+#endif
+                        {
+                          *puc = ((unsigned int) (c & 0x0f) << 12)
+                                 | ((unsigned int) (s[1] ^ 0x80) << 6)
+                                 | (unsigned int) (s[2] ^ 0x80);
+                          return 3;
+                        }
+#if CONFIG_UNICODE_SAFETY
+                      /* invalid multibyte character */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+                  /* invalid multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
                 }
               /* invalid multibyte character */
+#endif
             }
           else
             {
               /* incomplete multibyte character */
               *puc = 0xfffd;
-              return n;
+              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+                return 1;
+              else
+                return 2;
             }
         }
       else if (c < 0xf8)
@@ -90,28 +108,51 @@
           if (n >= 4)
             {
 #if CONFIG_UNICODE_SAFETY
-              if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
-                  && (s[3] ^ 0x80) < 0x40
-                  && (c >= 0xf1 || s[1] >= 0x90)
+              if ((s[1] ^ 0x80) < 0x40)
+                {
+                  if ((s[2] ^ 0x80) < 0x40)
+                    {
+                      if ((s[3] ^ 0x80) < 0x40)
+                        {
+                          if ((c >= 0xf1 || s[1] >= 0x90)
 #if 1
-                  && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
+                              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
+#endif
+                             )
 #endif
-                 )
-#endif
-                {
-                  *puc = ((unsigned int) (c & 0x07) << 18)
-                         | ((unsigned int) (s[1] ^ 0x80) << 12)
-                         | ((unsigned int) (s[2] ^ 0x80) << 6)
-                         | (unsigned int) (s[3] ^ 0x80);
-                  return 4;
+                            {
+                              *puc = ((unsigned int) (c & 0x07) << 18)
+                                     | ((unsigned int) (s[1] ^ 0x80) << 12)
+                                     | ((unsigned int) (s[2] ^ 0x80) << 6)
+                                     | (unsigned int) (s[3] ^ 0x80);
+                              return 4;
+                            }
+#if CONFIG_UNICODE_SAFETY
+                          /* invalid multibyte character */
+                          *puc = 0xfffd;
+                          return 4;
+                        }
+                      /* invalid multibyte character */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+                  /* invalid multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
                 }
               /* invalid multibyte character */
+#endif
             }
           else
             {
               /* incomplete multibyte character */
               *puc = 0xfffd;
-              return n;
+              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+                return 1;
+              else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
+                return 2;
+              else
+                return 3;
             }
         }
 #if 0
@@ -120,19 +161,42 @@
           if (n >= 5)
             {
 #if CONFIG_UNICODE_SAFETY
-              if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
-                  && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
-                  && (c >= 0xf9 || s[1] >= 0x88))
-#endif
+              if ((s[1] ^ 0x80) < 0x40)
                 {
-                  *puc = ((unsigned int) (c & 0x03) << 24)
-                         | ((unsigned int) (s[1] ^ 0x80) << 18)
-                         | ((unsigned int) (s[2] ^ 0x80) << 12)
-                         | ((unsigned int) (s[3] ^ 0x80) << 6)
-                         | (unsigned int) (s[4] ^ 0x80);
-                  return 5;
+                  if ((s[2] ^ 0x80) < 0x40)
+                    {
+                      if ((s[3] ^ 0x80) < 0x40)
+                        {
+                          if ((s[4] ^ 0x80) < 0x40)
+                            {
+                              if (c >= 0xf9 || s[1] >= 0x88)
+#endif
+                                {
+                                  *puc = ((unsigned int) (c & 0x03) << 24)
+                                         | ((unsigned int) (s[1] ^ 0x80) << 18)
+                                         | ((unsigned int) (s[2] ^ 0x80) << 12)
+                                         | ((unsigned int) (s[3] ^ 0x80) << 6)
+                                         | (unsigned int) (s[4] ^ 0x80);
+                                  return 5;
+                                }
+#if CONFIG_UNICODE_SAFETY
+                              /* invalid multibyte character */
+                              *puc = 0xfffd;
+                              return 5;
+                            }
+                          /* invalid multibyte character */
+                          *puc = 0xfffd;
+                          return 4;
+                        }
+                      /* invalid multibyte character */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+                  /* invalid multibyte character */
+                  return 2;
                 }
               /* invalid multibyte character */
+#endif
             }
           else
             {
@@ -146,21 +210,49 @@
           if (n >= 6)
             {
 #if CONFIG_UNICODE_SAFETY
-              if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
-                  && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
-                  && (s[5] ^ 0x80) < 0x40
-                  && (c >= 0xfd || s[1] >= 0x84))
-#endif
+              if ((s[1] ^ 0x80) < 0x40)
                 {
-                  *puc = ((unsigned int) (c & 0x01) << 30)
-                         | ((unsigned int) (s[1] ^ 0x80) << 24)
-                         | ((unsigned int) (s[2] ^ 0x80) << 18)
-                         | ((unsigned int) (s[3] ^ 0x80) << 12)
-                         | ((unsigned int) (s[4] ^ 0x80) << 6)
-                         | (unsigned int) (s[5] ^ 0x80);
-                  return 6;
+                  if ((s[2] ^ 0x80) < 0x40)
+                    {
+                      if ((s[3] ^ 0x80) < 0x40)
+                        {
+                          if ((s[4] ^ 0x80) < 0x40)
+                            {
+                              if ((s[5] ^ 0x80) < 0x40)
+                                {
+                                  if (c >= 0xfd || s[1] >= 0x84)
+#endif
+                                    {
+                                      *puc = ((unsigned int) (c & 0x01) << 30)
+                                             | ((unsigned int) (s[1] ^ 0x80) << 24)
+                                             | ((unsigned int) (s[2] ^ 0x80) << 18)
+                                             | ((unsigned int) (s[3] ^ 0x80) << 12)
+                                             | ((unsigned int) (s[4] ^ 0x80) << 6)
+                                             | (unsigned int) (s[5] ^ 0x80);
+                                      return 6;
+                                    }
+#if CONFIG_UNICODE_SAFETY
+                                  /* invalid multibyte character */
+                                  *puc = 0xfffd;
+                                  return 6;
+                                }
+                              /* invalid multibyte character */
+                              *puc = 0xfffd;
+                              return 5;
+                            }
+                          /* invalid multibyte character */
+                          *puc = 0xfffd;
+                          return 4;
+                        }
+                      /* invalid multibyte character */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+                  /* invalid multibyte character */
+                  return 2;
                 }
               /* invalid multibyte character */
+#endif
             }
           else
             {