diff lib/regex.h @ 16361:ab59b5080051

regex: merge glibc changes * lib/regcomp.c (init_dfa): Tighten overflow checks to test for IDX_MAX too, since IDX_MAX can be much less than SIZE_MAX. (init_word_char): Work even if bitset words are not exactly 32 or 64 bits wide. Don't assume there are no padding bits. * lib/regex.c [_LIBC]: Do not include <config.h>. [!_LIBC]: Add pragmas to ignore -Wsuggest-attributes=pure and -Wtype-limits. * lib/regex.h (__USE_GNU): Renamed from __USE_GNU_REGEX, to avoid needless disagreement with glibc. All uses changed. Define it to 1 only if _GNU_SOURCE, to match glibc. (_REG_RM_NAME): Remove; no longer needed, since the names in question are now all protected by __USE_GNU. (_REG_RE_NAME): Remove; replaced by glibc's __REPB_PREFIX. (REG_TRANSLATE_TYPE): Remove; replaced by glibc's __RE_TRANSLATE_TYPE. * lib/regex_internal.h (MIN): New macro. 2012-01-03 Ulrich Drepper <drepper@gmail.com> * lib/regcomp.c (init_word_char): Optimize regex a bit. 2011-12-30 Jakub Jelinek <jakub@redhat.com> * lib/regex_internal.c (re_string_fetch_byte_case): Fix up regcomp/regexec. The problem is that parse_bracket_symbol is miscompiled, and it turns out it is because of an incorrect attribute on re_string_fetch_byte_case. Unlike re_string_peek_byte_case, this one is really not pure, it modifies memory (increments pstr->cur_idx), and with the pure attribute GCC assumed it doesn't and it cached the presumed value of regexp->cur_idx in a variable across the for (;; ++i) { if (i >= BRACKET_NAME_BUF_SIZE) return REG_EBRACK; if (token->type == OP_OPEN_CHAR_CLASS) ch = re_string_fetch_byte_case (regexp); else ch = re_string_fetch_byte (regexp); if (re_string_eoi(regexp)) return REG_EBRACK; if (ch == delim && re_string_peek_byte (regexp, 0) == ']') break; elem->opr.name[i] = ch; } 2011-11-29 Andreas Schwab <schwab@redhat.com> * lib/regcomp.c (build_equiv_class): Fix access after end of search string in regex matcher. 2011-11-12 Ulrich Drepper <drepper@redhat.com> * lib/regex_internal.c, lib/regex_internal.h: Fix warnings in regex. 2011-10-12 Ulrich Drepper <drepper@redhat.com> * lib/regcomp.c (parse_branch): One more regex memory leak fixed. 2011-10-11 Ulrich Drepper <drepper@redhat.com> * lib/regcomp.c (parse_branch, parse_sub_exp): More regex memory leak fixes and tests. (parse_sub_exp, parse_bracket_exp): Fix memory leak for some invalid regular expressions. 2011-05-28 Ulrich Drepper <drepper@gmail.com> * lib/regex_internal.c, lib/regexec.c: Fix unnecessary overallocation due to incomplete character. When incomplete characters are found at the end of a string the code ran amok and allocated lots of memory. Stricter limits are now in place. 2011-05-20 Reuben Thomas <rrt@sc3d.org> * lib/regex.h: Update documentation. 2011-05-16 Aharon Robbins <arnold@skeeve.com> * lib/regex.h: Update RE_SYNTAX*_AWK constants. 2010-05-05 Andreas Schwab <schwab@redhat.com> * lib/regexec.c (find_collation_sequence_value): Fix lookup of collation sequence value during regexp matching. 2010-01-22 Ulrich Drepper <drepper@redhat.com> * lib/regex_internal.c (re_dfa_add_node): Extend overflow detection. 2008-01-16 Ulrich Drepper <drepper@redhat.com> * lib/regex.h: Cleanup namespace. 2007-11-26 Ulrich Drepper <drepper@redhat.com> * lib/regex.h (REG_ENOSYS): Define REG_ENOSYS also for __USE_XOPEN2K. 2007-08-26 Ulrich Drepper <drepper@redhat.com> * lib/regex_internal.h: Prevent some declarations and definitions to be seen when used in tests. 2005-05-06 Ulrich Drepper <drepper@redhat.com> * lib/regex_internal.h: Include bits/libc-lock.h or define dummy __libc_lock_* macros if not _LIBC. (struct re_dfa_t): Add lock.
author Paul Eggert <eggert@cs.ucla.edu>
date Tue, 07 Feb 2012 22:47:01 -0800
parents 18a38c9615f0
children bb182ee4a09d
line wrap: on
line diff
--- a/lib/regex.h
+++ b/lib/regex.h
@@ -1,6 +1,6 @@
 /* Definitions for data structures and routines for the regular
    expression library.
-   Copyright (C) 1985, 1989-1993, 1995-1998, 2000-2003, 2005-2006, 2009-2012
+   Copyright (C) 1985, 1989-1993, 1995-1998, 2000-2003, 2005-2012
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -28,13 +28,10 @@
 extern "C" {
 #endif
 
-/* Define __USE_GNU_REGEX to declare GNU extensions that violate the
+/* Define __USE_GNU to declare GNU extensions that violate the
    POSIX name space rules.  */
-#undef __USE_GNU_REGEX
-#if (defined _GNU_SOURCE					\
-     || (!defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE	\
-	 && !defined _XOPEN_SOURCE))
-# define __USE_GNU_REGEX 1
+#ifdef _GNU_SOURCE
+# define __USE_GNU 1
 #endif
 
 #ifdef _REGEX_LARGE_OFFSETS
@@ -45,16 +42,6 @@
    supported within glibc itself, and glibc users should not define
    _REGEX_LARGE_OFFSETS.  */
 
-/* The type of the offset of a byte within a string.
-   For historical reasons POSIX 1003.1-2004 requires that regoff_t be
-   at least as wide as off_t.  However, many common POSIX platforms set
-   regoff_t to the more-sensible ssize_t and the Open Group has
-   signalled its intention to change the requirement to be that
-   regoff_t be at least as wide as ptrdiff_t and ssize_t; see XBD ERN
-   60 (2005-08-25).  We don't know of any hosts where ssize_t or
-   ptrdiff_t is wider than ssize_t, so ssize_t is safe.  */
-typedef ssize_t regoff_t;
-
 /* The type of nonnegative object indexes.  Traditionally, GNU regex
    uses 'int' for these.  Code that uses __re_idx_t should work
    regardless of whether the type is signed.  */
@@ -69,10 +56,8 @@
 
 #else
 
-/* Use types that are binary-compatible with the traditional GNU regex
-   implementation, which mishandles strings longer than INT_MAX.  */
-
-typedef int regoff_t;
+/* The traditional GNU regex implementation mishandles strings longer
+   than INT_MAX.  */
 typedef int __re_idx_t;
 typedef unsigned int __re_size_t;
 typedef unsigned long int __re_long_size_t;
@@ -93,8 +78,7 @@
    add or remove a bit, only one other definition need change.  */
 typedef unsigned long int reg_syntax_t;
 
-#ifdef __USE_GNU_REGEX
-
+#ifdef __USE_GNU
 /* If this bit is not set, then \ inside a bracket expression is literal.
    If set, then such a \ quotes the following character.  */
 # define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1)
@@ -225,8 +209,7 @@
 /* If this bit is set, then no_sub will be set to 1 during
    re_compile_pattern.  */
 # define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
-
-#endif /* defined __USE_GNU_REGEX */
+#endif
 
 /* This global variable defines the particular regexp syntax to use (for
    some interfaces).  When a regexp is compiled, the syntax used is
@@ -234,7 +217,7 @@
    already-compiled regexps.  */
 extern reg_syntax_t re_syntax_options;
 
-#ifdef __USE_GNU_REGEX
+#ifdef __USE_GNU
 /* Define combinations of the above bits for the standard possibilities.
    (The [[[ comments delimit what gets put into the Texinfo file, so
    don't delete them!)  */
@@ -246,16 +229,19 @@
    | RE_NO_BK_PARENS              | RE_NO_BK_REFS			\
    | RE_NO_BK_VBAR                | RE_NO_EMPTY_RANGES			\
    | RE_DOT_NEWLINE		  | RE_CONTEXT_INDEP_ANCHORS		\
+   | RE_CHAR_CLASSES							\
    | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS)
 
 # define RE_SYNTAX_GNU_AWK						\
-  ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG)	\
-   & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS		\
-       | RE_CONTEXT_INVALID_OPS ))
+  ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS		\
+    | RE_INVALID_INTERVAL_ORD)						\
+   & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS				\
+      | RE_CONTEXT_INVALID_OPS ))
 
 # define RE_SYNTAX_POSIX_AWK						\
   (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS		\
-   | RE_INTERVALS	    | RE_NO_GNU_OPS)
+   | RE_INTERVALS	    | RE_NO_GNU_OPS				\
+   | RE_INVALID_INTERVAL_ORD)
 
 # define RE_SYNTAX_GREP							\
   (RE_BK_PLUS_QM              | RE_CHAR_CLASSES				\
@@ -306,10 +292,6 @@
    | RE_NO_BK_VBAR	    | RE_UNMATCHED_RIGHT_PAREN_ORD)
 /* [[[end syntaxes]]] */
 
-#endif /* defined __USE_GNU_REGEX */
-
-#ifdef __USE_GNU_REGEX
-
 /* Maximum number of duplicates an interval can allow.  POSIX-conforming
    systems might define this in <limits.h>, but we want our
    value, so remove any previous define.  */
@@ -325,8 +307,7 @@
    actually used a pattern like a\{214748363\}, so RE_DUP_MAX retains
    its historical value.  */
 # define RE_DUP_MAX (0x7fff)
-
-#endif /* defined __USE_GNU_REGEX */
+#endif
 
 
 /* POSIX 'cflags' bits (i.e., information for 'regcomp').  */
@@ -396,7 +377,7 @@
   _REG_ERPAREN		/* Unmatched ) or \); not returned from regcomp.  */
 } reg_errcode_t;
 
-#ifdef _XOPEN_SOURCE
+#if defined _XOPEN_SOURCE || defined __USE_XOPEN2K
 # define REG_ENOSYS	_REG_ENOSYS
 #endif
 #define REG_NOERROR	_REG_NOERROR
@@ -417,62 +398,51 @@
 #define REG_ESIZE	_REG_ESIZE
 #define REG_ERPAREN	_REG_ERPAREN
 
-/* struct re_pattern_buffer normally uses member names like 'buffer'
-   that POSIX does not allow.  In POSIX mode these members have names
-   with leading 're_' (e.g., 're_buffer').  */
-#ifdef __USE_GNU_REGEX
-# define _REG_RE_NAME(id) id
-# define _REG_RM_NAME(id) id
-#else
-# define _REG_RE_NAME(id) re_##id
-# define _REG_RM_NAME(id) rm_##id
+/* This data structure represents a compiled pattern.  Before calling
+   the pattern compiler, the fields 'buffer', 'allocated', 'fastmap',
+   and 'translate' can be set.  After the pattern has been compiled,
+   the fields 're_nsub', 'not_bol' and 'not_eol' are available.  All
+   other fields are private to the regex routines.  */
+
+#ifndef RE_TRANSLATE_TYPE
+# define __RE_TRANSLATE_TYPE unsigned char *
+# ifdef __USE_GNU
+#  define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE
+# endif
 #endif
 
-/* The user can specify the type of the re_translate member by
-   defining the macro RE_TRANSLATE_TYPE, which defaults to unsigned
-   char *.  This pollutes the POSIX name space, so in POSIX mode just
-   use unsigned char *.  */
-#ifdef __USE_GNU_REGEX
-# ifndef RE_TRANSLATE_TYPE
-#  define RE_TRANSLATE_TYPE unsigned char *
-# endif
-# define REG_TRANSLATE_TYPE RE_TRANSLATE_TYPE
+#ifdef __USE_GNU
+# define __REPB_PREFIX(name) name
 #else
-# define REG_TRANSLATE_TYPE unsigned char *
+# define __REPB_PREFIX(name) __##name
 #endif
 
-/* This data structure represents a compiled pattern.  Before calling
-   the pattern compiler, the fields 'buffer', 'allocated', 'fastmap',
-   'translate', and 'no_sub' can be set.  After the pattern has been
-   compiled, the 're_nsub' field is available.  All other fields are
-   private to the regex routines.  */
-
 struct re_pattern_buffer
 {
   /* Space that holds the compiled pattern.  It is declared as
      'unsigned char *' because its elements are sometimes used as
      array indexes.  */
-  unsigned char *_REG_RE_NAME (buffer);
+  unsigned char *__REPB_PREFIX(buffer);
 
   /* Number of bytes to which 'buffer' points.  */
-  __re_long_size_t _REG_RE_NAME (allocated);
+  __re_long_size_t __REPB_PREFIX(allocated);
 
   /* Number of bytes actually used in 'buffer'.  */
-  __re_long_size_t _REG_RE_NAME (used);
+  __re_long_size_t __REPB_PREFIX(used);
 
   /* Syntax setting with which the pattern was compiled.  */
-  reg_syntax_t _REG_RE_NAME (syntax);
+  reg_syntax_t __REPB_PREFIX(syntax);
 
   /* Pointer to a fastmap, if any, otherwise zero.  re_search uses the
      fastmap, if there is one, to skip over impossible starting points
      for matches.  */
-  char *_REG_RE_NAME (fastmap);
+  char *__REPB_PREFIX(fastmap);
 
   /* Either a translate table to apply to all characters before
      comparing them, or zero for no translation.  The translation is
      applied to a pattern when it is compiled and to a string when it
      is matched.  */
-  REG_TRANSLATE_TYPE _REG_RE_NAME (translate);
+  __RE_TRANSLATE_TYPE __REPB_PREFIX(translate);
 
   /* Number of subexpressions found by the compiler.  */
   size_t re_nsub;
@@ -481,57 +451,70 @@
      Well, in truth it's used only in 're_search_2', to see whether or
      not we should use the fastmap, so we don't set this absolutely
      perfectly; see 're_compile_fastmap' (the "duplicate" case).  */
-  unsigned int _REG_RE_NAME (can_be_null) : 1;
+  unsigned __REPB_PREFIX(can_be_null) : 1;
 
   /* If REGS_UNALLOCATED, allocate space in the 'regs' structure
      for 'max (RE_NREGS, re_nsub + 1)' groups.
      If REGS_REALLOCATE, reallocate space if necessary.
      If REGS_FIXED, use what's there.  */
-#ifdef __USE_GNU_REGEX
+#ifdef __USE_GNU
 # define REGS_UNALLOCATED 0
 # define REGS_REALLOCATE 1
 # define REGS_FIXED 2
 #endif
-  unsigned int _REG_RE_NAME (regs_allocated) : 2;
+  unsigned __REPB_PREFIX(regs_allocated) : 2;
 
   /* Set to zero when 're_compile_pattern' compiles a pattern; set to
      one by 're_compile_fastmap' if it updates the fastmap.  */
-  unsigned int _REG_RE_NAME (fastmap_accurate) : 1;
+  unsigned __REPB_PREFIX(fastmap_accurate) : 1;
 
   /* If set, 're_match_2' does not return information about
      subexpressions.  */
-  unsigned int _REG_RE_NAME (no_sub) : 1;
+  unsigned __REPB_PREFIX(no_sub) : 1;
 
   /* If set, a beginning-of-line anchor doesn't match at the beginning
      of the string.  */
-  unsigned int _REG_RE_NAME (not_bol) : 1;
+  unsigned __REPB_PREFIX(not_bol) : 1;
 
   /* Similarly for an end-of-line anchor.  */
-  unsigned int _REG_RE_NAME (not_eol) : 1;
+  unsigned __REPB_PREFIX(not_eol) : 1;
 
   /* If true, an anchor at a newline matches.  */
-  unsigned int _REG_RE_NAME (newline_anchor) : 1;
-
-/* [[[end pattern_buffer]]] */
+  unsigned __REPB_PREFIX(newline_anchor) : 1;
 };
 
 typedef struct re_pattern_buffer regex_t;
 
+/* Type for byte offsets within the string.  POSIX mandates this.  */
+#ifdef _REGEX_LARGE_OFFSETS
+/* POSIX 1003.1-2008 requires that regoff_t be at least as wide as
+   ptrdiff_t and ssize_t.  We don't know of any hosts where ptrdiff_t
+   is wider than ssize_t, so ssize_t is safe.  */
+typedef ssize_t regoff_t;
+#else
+/* The traditional GNU regex implementation mishandles strings longer
+   than INT_MAX.  */
+typedef int regoff_t;
+#endif
+
+
+#ifdef __USE_GNU
 /* This is the structure we store register match data in.  See
    regex.texinfo for a full description of what registers match.  */
 struct re_registers
 {
-  __re_size_t _REG_RM_NAME (num_regs);
-  regoff_t *_REG_RM_NAME (start);
-  regoff_t *_REG_RM_NAME (end);
+  __re_size_t num_regs;
+  regoff_t *start;
+  regoff_t *end;
 };
 
 
 /* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
    're_match_2' returns information about at least this many registers
    the first time a 'regs' structure is passed.  */
-#if !defined RE_NREGS && defined __USE_GNU_REGEX
-# define RE_NREGS 30
+# ifndef RE_NREGS
+#  define RE_NREGS 30
+# endif
 #endif
 
 
@@ -546,13 +529,19 @@
 
 /* Declarations for routines.  */
 
+#ifdef __USE_GNU
 /* Sets the current default syntax to SYNTAX, and return the old syntax.
    You can also simply assign to the 're_syntax_options' variable.  */
 extern reg_syntax_t re_set_syntax (reg_syntax_t __syntax);
 
 /* Compile the regular expression PATTERN, with length LENGTH
    and syntax given by the global 're_syntax_options', into the buffer
-   BUFFER.  Return NULL if successful, and an error string if not.  */
+   BUFFER.  Return NULL if successful, and an error string if not.
+
+   To free the allocated storage, you must call 'regfree' on BUFFER.
+   Note that the translate table must either have been initialised by
+   'regcomp', with a malloc'ed value, or set to NULL before calling
+   'regfree'.  */
 extern const char *re_compile_pattern (const char *__pattern, size_t __length,
 				       struct re_pattern_buffer *__buffer);
 
@@ -609,14 +598,15 @@
    register data.
 
    Unless this function is called, the first search or match using
-   BUFFER will allocate its own register data, without freeing the old
-   data.  */
+   BUFFER will allocate its own register data, without
+   freeing the old data.  */
 extern void re_set_registers (struct re_pattern_buffer *__buffer,
 			      struct re_registers *__regs,
 			      __re_size_t __num_regs,
 			      regoff_t *__starts, regoff_t *__ends);
+#endif	/* Use GNU */
 
-#if defined _REGEX_RE_COMP || defined _LIBC
+#if defined _REGEX_RE_COMP || (defined _LIBC && defined __USE_BSD)
 # ifndef _CRAY
 /* 4.2 bsd compatibility.  */
 extern char *re_comp (const char *);