Mercurial > hg > octave-shane > gnulib-hg
annotate lib/mbfile.h @ 12421:e8d2c6fc33ad
Use spaces for indentation, not tabs.
author | Bruno Haible <bruno@clisp.org> |
---|---|
date | Thu, 10 Dec 2009 20:28:30 +0100 |
parents | bbbbbf4cd1c5 |
children | b5e42ef33b49 |
rev | line source |
---|---|
6046 | 1 /* Multibyte character I/O: macros for multi-byte encodings. |
2 Copyright (C) 2001, 2005 Free Software Foundation, Inc. | |
3 | |
9309
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
6046
diff
changeset
|
4 This program is free software: you can redistribute it and/or modify |
6046 | 5 it under the terms of the GNU General Public License as published by |
9309
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
6046
diff
changeset
|
6 the Free Software Foundation; either version 3 of the License, or |
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
6046
diff
changeset
|
7 (at your option) any later version. |
6046 | 8 |
9 This program is distributed in the hope that it will be useful, | |
10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 GNU General Public License for more details. | |
13 | |
14 You should have received a copy of the GNU General Public License | |
9309
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
6046
diff
changeset
|
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ |
6046 | 16 |
17 /* Written by Mitsuru Chinen <mchinen@yamato.ibm.com> | |
18 and Bruno Haible <bruno@clisp.org>. */ | |
19 | |
20 /* The macros in this file implement multi-byte character input from a | |
21 stream. | |
22 | |
23 mb_file_t | |
24 is the type for multibyte character input stream, usable for variable | |
25 declarations. | |
26 | |
27 mbf_char_t | |
28 is the type for multibyte character or EOF, usable for variable | |
29 declarations. | |
30 | |
31 mbf_init (mbf, stream) | |
32 initializes the MB_FILE for reading from stream. | |
33 | |
34 mbf_getc (mbc, mbf) | |
35 reads the next multibyte character from mbf and stores it in mbc. | |
36 | |
37 mb_iseof (mbc) | |
38 returns true if mbc represents the EOF value. | |
39 | |
40 Here are the function prototypes of the macros. | |
41 | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
42 extern void mbf_init (mb_file_t mbf, FILE *stream); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
43 extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
44 extern bool mb_iseof (const mbf_char_t mbc); |
6046 | 45 */ |
46 | |
47 #ifndef _MBFILE_H | |
48 #define _MBFILE_H 1 | |
49 | |
50 #include <assert.h> | |
51 #include <stdbool.h> | |
52 #include <stdio.h> | |
53 #include <string.h> | |
54 | |
55 /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before | |
56 <wchar.h>. | |
57 BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before | |
58 <wchar.h>. */ | |
59 #include <stdio.h> | |
60 #include <time.h> | |
61 #include <wchar.h> | |
62 | |
63 #include "mbchar.h" | |
64 | |
65 struct mbfile_multi { | |
66 FILE *fp; | |
67 bool eof_seen; | |
68 bool have_pushback; | |
69 mbstate_t state; | |
70 unsigned int bufcount; | |
71 char buf[MBCHAR_BUF_SIZE]; | |
72 struct mbchar pushback; | |
73 }; | |
74 | |
75 static inline void | |
76 mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) | |
77 { | |
78 size_t bytes; | |
79 | |
80 /* If EOF has already been seen, don't use getc. This matters if | |
81 mbf->fp is connected to an interactive tty. */ | |
82 if (mbf->eof_seen) | |
83 goto eof; | |
84 | |
85 /* Return character pushed back, if there is one. */ | |
86 if (mbf->have_pushback) | |
87 { | |
88 mb_copy (mbc, &mbf->pushback); | |
89 mbf->have_pushback = false; | |
90 return; | |
91 } | |
92 | |
93 /* Before using mbrtowc, we need at least one byte. */ | |
94 if (mbf->bufcount == 0) | |
95 { | |
96 int c = getc (mbf->fp); | |
97 if (c == EOF) | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
98 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
99 mbf->eof_seen = true; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
100 goto eof; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
101 } |
6046 | 102 mbf->buf[0] = (unsigned char) c; |
103 mbf->bufcount++; | |
104 } | |
105 | |
106 /* Handle most ASCII characters quickly, without calling mbrtowc(). */ | |
107 if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0])) | |
108 { | |
109 /* These characters are part of the basic character set. ISO C 99 | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
110 guarantees that their wide character code is identical to their |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
111 char code. */ |
6046 | 112 mbc->wc = mbc->buf[0] = mbf->buf[0]; |
113 mbc->wc_valid = true; | |
114 mbc->ptr = &mbc->buf[0]; | |
115 mbc->bytes = 1; | |
116 mbf->bufcount = 0; | |
117 return; | |
118 } | |
119 | |
120 /* Use mbrtowc on an increasing number of bytes. Read only as many bytes | |
121 from mbf->fp as needed. This is needed to give reasonable interactive | |
122 behaviour when mbf->fp is connected to an interactive tty. */ | |
123 for (;;) | |
124 { | |
125 /* We don't know whether the 'mbrtowc' function updates the state when | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
126 it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
127 not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
128 don't have an autoconf test for this, yet. |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
129 The new behaviour would allow us to feed the bytes one by one into |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
130 mbrtowc. But the old behaviour forces us to feed all bytes since |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
131 the end of the last character into mbrtowc. Since we want to retry |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
132 with more bytes when mbrtowc returns -2, we must backup the state |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
133 before calling mbrtowc, because implementations with the new |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
134 behaviour will clobber it. */ |
6046 | 135 mbstate_t backup_state = mbf->state; |
136 | |
137 bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); | |
138 | |
139 if (bytes == (size_t) -1) | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
140 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
141 /* An invalid multibyte sequence was encountered. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
142 /* Return a single byte. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
143 bytes = 1; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
144 mbc->wc_valid = false; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
145 break; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
146 } |
6046 | 147 else if (bytes == (size_t) -2) |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
148 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
149 /* An incomplete multibyte character. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
150 mbf->state = backup_state; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
151 if (mbf->bufcount == MBCHAR_BUF_SIZE) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
152 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
153 /* An overlong incomplete multibyte sequence was encountered. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
154 /* Return a single byte. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
155 bytes = 1; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
156 mbc->wc_valid = false; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
157 break; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
158 } |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
159 else |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
160 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
161 /* Read one more byte and retry mbrtowc. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
162 int c = getc (mbf->fp); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
163 if (c == EOF) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
164 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
165 /* An incomplete multibyte character at the end. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
166 mbf->eof_seen = true; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
167 bytes = mbf->bufcount; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
168 mbc->wc_valid = false; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
169 break; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
170 } |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
171 mbf->buf[mbf->bufcount] = (unsigned char) c; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
172 mbf->bufcount++; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
173 } |
6046 | 174 } |
175 else | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
176 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
177 if (bytes == 0) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
178 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
179 /* A null wide character was encountered. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
180 bytes = 1; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
181 assert (mbf->buf[0] == '\0'); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
182 assert (mbc->wc == 0); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
183 } |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
184 mbc->wc_valid = true; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
185 break; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
186 } |
6046 | 187 } |
188 | |
189 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ | |
190 mbc->ptr = &mbc->buf[0]; | |
191 memcpy (&mbc->buf[0], &mbf->buf[0], bytes); | |
192 mbc->bytes = bytes; | |
193 | |
194 mbf->bufcount -= bytes; | |
195 if (mbf->bufcount > 0) | |
196 { | |
197 /* It's not worth calling memmove() for so few bytes. */ | |
198 unsigned int count = mbf->bufcount; | |
199 char *p = &mbf->buf[0]; | |
200 | |
201 do | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
202 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
203 *p = *(p + bytes); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
204 p++; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
205 } |
6046 | 206 while (--count > 0); |
207 } | |
208 return; | |
209 | |
210 eof: | |
211 /* An mbchar_t with bytes == 0 is used to indicate EOF. */ | |
212 mbc->ptr = NULL; | |
213 mbc->bytes = 0; | |
214 mbc->wc_valid = false; | |
215 return; | |
216 } | |
217 | |
218 static inline void | |
219 mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf) | |
220 { | |
221 mb_copy (&mbf->pushback, mbc); | |
222 mbf->have_pushback = true; | |
223 } | |
224 | |
225 typedef struct mbfile_multi mb_file_t; | |
226 | |
227 typedef mbchar_t mbf_char_t; | |
228 | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
/* Initialize the mb_file_t MBF for reading from the stdio stream STREAM:
   no EOF seen, no pushed-back character, initial conversion state, empty
   byte buffer.
   MBF is the object itself, not a pointer to it.  It is expanded several
   times, so it must be an lvalue expression without side effects.  */
#define mbf_init(mbf, stream)                                           \
  ((mbf).fp = (stream),                                                 \
   memset (&(mbf).state, 0, sizeof ((mbf).state)),                      \
   (mbf).have_pushback = false,                                         \
   (mbf).eof_seen = false,                                              \
   (mbf).bufcount = 0)
235 | |
/* Read the next multibyte character from the mb_file_t MBF and store it
   in the mbf_char_t MBC.  Both arguments are objects, not pointers.  */
#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))

/* Hand the multibyte character MBC back to the mb_file_t MBF, which keeps
   a copy of it.  Both arguments are objects, not pointers.  */
#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))

/* True if MBC represents the EOF value, i.e. it consists of zero bytes.  */
#define mb_iseof(mbc) ((mbc).bytes == 0)
241 | |
242 #endif /* _MBFILE_H */ |