view lib/unicase/u-ct-totitle.h @ 11702:daa1eb869953

Reduce the number of uc_is_cased calls.
author Bruno Haible <bruno@clisp.org>
date Wed, 01 Jul 2009 01:49:33 +0200
parents aa8baf2916ba
children e8d2c6fc33ad
line wrap: on
line source

/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
   Copyright (C) 2009 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* Quoting the Unicode standard, section "Default Case Algorithms":
     Find the word boundaries in X according to Unicode Standard Annex #29,
     “Text Boundaries.” For each word boundary, find the first cased character
     F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
     then map all characters C between F and the following word boundary to
     Lowercase_Mapping(C).  */

UNIT *
FUNC (const UNIT *s, size_t n,
      casing_prefix_context_t prefix_context,
      casing_suffix_context_t suffix_context,
      const char *iso639_language,
      uninorm_t nf,
      UNIT *resultbuf, size_t *lengthp)
{
  /* The result being accumulated.  */
  UNIT *result;
  size_t length;
  size_t allocated;
  /* An array containing the word break positions.  */
  char *wordbreaks;

  /* Initialize the accumulator.  */
  if (nf != NULL || resultbuf == NULL)
    {
      result = NULL;
      allocated = 0;
    }
  else
    {
      result = resultbuf;
      allocated = *lengthp;
    }
  length = 0;

  /* Initialize the word breaks array.  */
  if (n > 0)
    {
      wordbreaks = (char *) malloc (n);
      if (wordbreaks == NULL)
	{
	  errno = ENOMEM;
	  goto fail2;
	}
      U_WORDBREAKS (s, n, wordbreaks);
    }
  else
    wordbreaks = NULL;

  {
    const UNIT *s_end = s + n;
    const char *wp = wordbreaks;

    /* When considering the string as segmented by word boundaries: For each
       such segment:
	- In the first part, we are searching for the first cased character.
	  In this state, in_word_first_part = true, and no conversion takes
	  place.
	- In the second part, we are converting every character: the first
	  among these characters to title case, the other ones to lower case.
	  In this state, in_word_first_part = false.  */
    bool in_word_first_part = true;

    /* Helper for evaluating the FINAL_SIGMA condition:
       Last character that was not case-ignorable.  */
    ucs4_t last_char_except_ignorable =
      prefix_context.last_char_except_ignorable;

    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
       Last character that was of combining class 230 ("Above") or 0.  */
    ucs4_t last_char_normal_or_above =
      prefix_context.last_char_normal_or_above;

    while (s < s_end)
      {
	/* Fetch the next character.  */
	ucs4_t uc;
	int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);

	ucs4_t (*single_character_map) (ucs4_t);
	size_t offset_in_rule; /* offset in 'struct special_casing_rule' */

	ucs4_t mapped_uc[3];
	unsigned int mapped_count;

	if (*wp)
	  /* Crossing a word boundary.  */
	  in_word_first_part = true;

	/* Determine single_character_map, offset_in_rule.
	   There are three possibilities:
	     - uc should not be converted.
	     - uc should be titlecased.
	     - uc should be lowercased.  */
	if (in_word_first_part)
	  {
	    if (uc_is_cased (uc))
	      {
		/* uc is to be titlecased.  */
		single_character_map = uc_totitle;
		offset_in_rule = offsetof (struct special_casing_rule, title[0]);
		in_word_first_part = false;
	      }
	    else
	      {
		/* uc is not converted.  */
		single_character_map = NULL;
		offset_in_rule = 0;
	      }
	  }
	else
	  {
	    /* uc is to be lowercased.  */
	    single_character_map = uc_tolower;
	    offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
	  }

	/* Actually map uc.  */
	if (single_character_map == NULL)
	  {
	    mapped_uc[0] = uc;
	    mapped_count = 1;
	    goto found_mapping;
	  }

	if (uc < 0x10000)
	  {
	    /* Look first in the special-casing table.  */
	    char code[3];

	    code[0] = (uc >> 8) & 0xff;
	    code[1] = uc & 0xff;

	    for (code[2] = 0; ; code[2]++)
	      {
		const struct special_casing_rule *rule =
		  gl_unicase_special_lookup (code, 3);

		if (rule == NULL)
		  break;

		/* Test if the condition applies.  */
		/* Does the language apply?  */
		if (rule->language[0] == '\0'
		    || (iso639_language != NULL
			&& iso639_language[0] == rule->language[0]
			&& iso639_language[1] == rule->language[1]))
		  {
		    /* Does the context apply?  */
		    int context = rule->context;
		    bool applies;

		    if (context < 0)
		      context = - context;
		    switch (context)
		      {
		      case SCC_ALWAYS:
			applies = true;
			break;

		      case SCC_FINAL_SIGMA:
			/* "Before" condition: preceded by a sequence
			   consisting of a cased letter and a case-ignorable
			   sequence.
			   "After" condition: not followed by a sequence
			   consisting of a case-ignorable sequence and then a
			   cased letter.  */
			/* Test the "before" condition.  */
			applies = uc_is_cased (last_char_except_ignorable);
			/* Test the "after" condition.  */
			if (applies)
			  {
			    const UNIT *s2 = s + count;
			    for (;;)
			      {
				if (s2 < s_end)
				  {
				    ucs4_t uc2;
				    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
				    /* Our uc_is_case_ignorable function is
				       known to return false for all cased
				       characters.  So we can call
				       uc_is_case_ignorable first.  */
				    if (!uc_is_case_ignorable (uc2))
				      {
					applies = ! uc_is_cased (uc2);
					break;
				      }
				    s2 += count2;
				  }
				else
				  {
				    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
				    break;
				  }
			      }
			  }
			break;

		      case SCC_AFTER_SOFT_DOTTED:
			/* "Before" condition: There is a Soft_Dotted character
			   before it, with no intervening character of
			   combining class 0 or 230 (Above).  */
			/* Test the "before" condition.  */
			applies = uc_is_property_soft_dotted (last_char_normal_or_above);
			break;

		      case SCC_MORE_ABOVE:
			/* "After" condition: followed by a character of
			   combining class 230 (Above) with no intervening
			   character of combining class 0 or 230 (Above).  */
			/* Test the "after" condition.  */
			{
			  const UNIT *s2 = s + count;
			  applies = false;
			  for (;;)
			    {
			      if (s2 < s_end)
				{
				  ucs4_t uc2;
				  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
				  int ccc = uc_combining_class (uc2);
				  if (ccc == UC_CCC_A)
				    {
				      applies = true;
				      break;
				    }
				  if (ccc == UC_CCC_NR)
				    break;
				  s2 += count2;
				}
			      else
				{
				  applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
				  break;
				}
			    }
			}
			break;

		      case SCC_BEFORE_DOT:
			/* "After" condition: followed by COMBINING DOT ABOVE
			   (U+0307). Any sequence of characters with a
			   combining class that is neither 0 nor 230 may
			   intervene between the current character and the
			   combining dot above.  */
			/* Test the "after" condition.  */
			{
			  const UNIT *s2 = s + count;
			  applies = false;
			  for (;;)
			    {
			      if (s2 < s_end)
				{
				  ucs4_t uc2;
				  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
				  if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
				    {
				      applies = true;
				      break;
				    }
				  {
				    int ccc = uc_combining_class (uc2);
				    if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
				      break;
				  }
				  s2 += count2;
				}
			      else
				{
				  applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
				  break;
				}
			    }
			}
			break;

		      case SCC_AFTER_I:
			/* "Before" condition: There is an uppercase I before
			   it, and there is no intervening character of
			   combining class 0 or 230 (Above).  */
			/* Test the "before" condition.  */
			applies = (last_char_normal_or_above == 'I');
			break;

		      default:
			abort ();
		      }
		    if (rule->context < 0)
		      applies = !applies;

		    if (applies)
		      {
			/* The rule applies.
			   Look up the mapping (0 to 3 characters).  */
			const unsigned short *mapped_in_rule =
			  (const unsigned short *)((const char *)rule + offset_in_rule);

			if (mapped_in_rule[0] == 0)
			  mapped_count = 0;
			else
			  {
			    mapped_uc[0] = mapped_in_rule[0];
			    if (mapped_in_rule[1] == 0)
			      mapped_count = 1;
			    else
			      {
				mapped_uc[1] = mapped_in_rule[1];
				if (mapped_in_rule[2] == 0)
				  mapped_count = 2;
				else
				  {
				    mapped_uc[2] = mapped_in_rule[2];
				    mapped_count = 3;
				  }
			      }
			  }
			goto found_mapping;
		      }
		  }

		/* Optimization: Save a hash table lookup in the next round.  */
		if (!rule->has_next)
		  break;
	      }
	  }

	/* No special-cased mapping.  So use the locale and context independent
	   mapping.  */
	mapped_uc[0] = single_character_map (uc);
	mapped_count = 1;

       found_mapping:
	/* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
	{
	  unsigned int i;

	  for (i = 0; i < mapped_count; i++)
	    {
	      ucs4_t muc = mapped_uc[i];

	      /* Append muc to the result accumulator.  */
	      if (length < allocated)
		{
		  int ret = U_UCTOMB (result + length, muc, allocated - length);
		  if (ret == -1)
		    {
		      errno = EINVAL;
		      goto fail1;
		    }
		  if (ret >= 0)
		    {
		      length += ret;
		      goto done_appending;
		    }
		}
	      {
		size_t old_allocated = allocated;
		size_t new_allocated = 2 * old_allocated;
		if (new_allocated < 64)
		  new_allocated = 64;
		if (new_allocated < old_allocated) /* integer overflow? */
		  abort ();
		{
		  UNIT *larger_result;
		  if (result == NULL)
		    {
		      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
		      if (larger_result == NULL)
			{
			  errno = ENOMEM;
			  goto fail1;
			}
		    }
		  else if (result == resultbuf)
		    {
		      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
		      if (larger_result == NULL)
			{
			  errno = ENOMEM;
			  goto fail1;
			}
		      U_CPY (larger_result, resultbuf, length);
		    }
		  else
		    {
		      larger_result =
			(UNIT *) realloc (result, new_allocated * sizeof (UNIT));
		      if (larger_result == NULL)
			{
			  errno = ENOMEM;
			  goto fail1;
			}
		    }
		  result = larger_result;
		  allocated = new_allocated;
		  {
		    int ret = U_UCTOMB (result + length, muc, allocated - length);
		    if (ret == -1)
		      {
			errno = EINVAL;
			goto fail1;
		      }
		    if (ret < 0)
		      abort ();
		    length += ret;
		    goto done_appending;
		  }
		}
	      }
	     done_appending: ;
	    }
	}

	if (!uc_is_case_ignorable (uc))
	  last_char_except_ignorable = uc;

	{
	  int ccc = uc_combining_class (uc);
	  if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
	    last_char_normal_or_above = uc;
	}

	s += count;
	wp += count;
      }
  }

  free (wordbreaks);

  if (nf != NULL)
    {
      /* Finally, normalize the result.  */
      UNIT *normalized_result;

      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
      if (normalized_result == NULL)
	goto fail2;

      free (result);
      return normalized_result;
    }

  if (length == 0)
    {
      if (result == NULL)
	{
	  /* Return a non-NULL value.  NULL means error.  */
	  result = (UNIT *) malloc (1);
	  if (result == NULL)
	    {
	      errno = ENOMEM;
	      goto fail2;
	    }
	}
    }
  else if (result != resultbuf && length < allocated)
    {
      /* Shrink the allocated memory if possible.  */
      UNIT *memory;

      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
      if (memory != NULL)
	result = memory;
    }

  *lengthp = length;
  return result;

 fail1:
  {
    int saved_errno = errno;
    free (wordbreaks);
    errno = saved_errno;
  }
 fail2:
  if (result != resultbuf)
    {
      int saved_errno = errno;
      free (result);
      errno = saved_errno;
    }
  return NULL;
}