Subversion Repositories Kolibri OS

Rev

Rev 4874 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4349 Serge 1
#include 
2
#include 
3
#include 
4
#include "mbctype.h"
5
#include 
6
#include 
7
#include 
8
#include "local.h"
9
 
10
int (*__mbtowc) (struct _reent *, wchar_t *, const char *, size_t,
11
		 const char *, mbstate_t *)
12
#ifdef __CYGWIN__
13
   /* Cygwin starts up in UTF-8 mode. */
14
   = __utf8_mbtowc;
15
#else
16
   = __ascii_mbtowc;
17
#endif
18
 
19
int
20
_DEFUN (_mbtowc_r, (r, pwc, s, n, state),
21
        struct _reent *r   _AND
4921 Serge 22
        wchar_t       *__restrict pwc _AND
23
        const char    *__restrict s   _AND
4349 Serge 24
        size_t         n   _AND
25
        mbstate_t      *state)
26
{
27
  return __mbtowc (r, pwc, s, n, __locale_charset (), state);
28
}
29
 
30
int
31
_DEFUN (__ascii_mbtowc, (r, pwc, s, n, charset, state),
32
        struct _reent *r       _AND
33
        wchar_t       *pwc     _AND
34
        const char    *s       _AND
35
        size_t         n       _AND
36
	const char    *charset _AND
37
        mbstate_t      *state)
38
{
39
  wchar_t dummy;
40
  unsigned char *t = (unsigned char *)s;
41
 
42
  if (pwc == NULL)
43
    pwc = &dummy;
44
 
45
  if (s == NULL)
46
    return 0;
47
 
48
  if (n == 0)
49
    return -2;
50
 
51
#ifdef __CYGWIN__
52
  if ((wchar_t)*t >= 0x80)
53
    {
54
      r->_errno = EILSEQ;
55
      return -1;
56
    }
57
#endif
58
 
59
  *pwc = (wchar_t)*t;
60
 
61
  if (*t == '\0')
62
    return 0;
63
 
64
  return 1;
65
}
66
 
67
#ifdef _MB_CAPABLE
68
/* Class of one input byte as seen by the JIS state machine.  The values
   index the columns of the state/action tables; JIS_C_NUM is the number
   of classes.  */
typedef enum
{
  ESCAPE,
  DOLLAR,
  BRACKET,
  AT,
  B,
  J,
  NUL,
  JIS_CHAR,
  OTHER,
  JIS_C_NUM
} JIS_CHAR_TYPE;

/* States of the JIS decoder.  The values index the rows of the
   state/action tables; JIS_S_NUM is the number of states.  */
typedef enum
{
  ASCII,
  JIS,
  A_ESC,
  A_ESC_DL,
  JIS_1,
  J_ESC,
  J_ESC_BR,
  INV,
  JIS_S_NUM
} JIS_STATE;

/* What the decoder does with the current byte after a table lookup.  */
typedef enum
{
  COPY_A,
  COPY_J1,
  COPY_J2,
  MAKE_A,
  NOOP,
  EMPTY,
  ERROR
} JIS_ACTION;
73
 
74
/**************************************************************************************
 * state/action tables for processing JIS encoding
 * Where possible, switches to JIS are grouped with the following JIS characters and
 * switches to ASCII are grouped with the preceding JIS characters.  Thus, maximum
 * returned length is 2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6.
 *************************************************************************************/
80
 
4921 Serge 81
#ifndef  __CYGWIN__
4349 Serge 82
static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
83
/*              ESCAPE   DOLLAR    BRACKET   AT       B       J        NUL      JIS_CHAR  OTHER */
84
/* ASCII */   { A_ESC,   ASCII,    ASCII,    ASCII,   ASCII,  ASCII,   ASCII,   ASCII,    ASCII },
85
/* JIS */     { J_ESC,   JIS_1,    JIS_1,    JIS_1,   JIS_1,  JIS_1,   INV,     JIS_1,    INV },
86
/* A_ESC */   { ASCII,   A_ESC_DL, ASCII,    ASCII,   ASCII,  ASCII,   ASCII,   ASCII,    ASCII },
87
/* A_ESC_DL */{ ASCII,   ASCII,    ASCII,    JIS,     JIS,    ASCII,   ASCII,   ASCII,    ASCII },
88
/* JIS_1 */   { INV,     JIS,      JIS,      JIS,     JIS,    JIS,     INV,     JIS,      INV },
89
/* J_ESC */   { INV,     INV,      J_ESC_BR, INV,     INV,    INV,     INV,     INV,      INV },
90
/* J_ESC_BR */{ INV,     INV,      INV,      INV,     ASCII,  ASCII,   INV,     INV,      INV },
91
};
92
 
93
static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
94
/*              ESCAPE   DOLLAR    BRACKET   AT       B        J        NUL      JIS_CHAR  OTHER */
95
/* ASCII */   { NOOP,    COPY_A,   COPY_A,   COPY_A,  COPY_A,  COPY_A,  EMPTY,   COPY_A,  COPY_A},
96
/* JIS */     { NOOP,    COPY_J1,  COPY_J1,  COPY_J1, COPY_J1, COPY_J1, ERROR,   COPY_J1, ERROR },
97
/* A_ESC */   { COPY_A,  NOOP,     COPY_A,   COPY_A,  COPY_A,  COPY_A,  COPY_A,  COPY_A,  COPY_A},
98
/* A_ESC_DL */{ COPY_A,  COPY_A,   COPY_A,   NOOP,    NOOP,    COPY_A,  COPY_A,  COPY_A,  COPY_A},
99
/* JIS_1 */   { ERROR,   COPY_J2,  COPY_J2,  COPY_J2, COPY_J2, COPY_J2, ERROR,   COPY_J2, ERROR },
100
/* J_ESC */   { ERROR,   ERROR,    NOOP,     ERROR,   ERROR,   ERROR,   ERROR,   ERROR,   ERROR },
101
/* J_ESC_BR */{ ERROR,   ERROR,    ERROR,    ERROR,   MAKE_A,  MAKE_A,  ERROR,   ERROR,   ERROR },
102
};
4921 Serge 103
#endif /* !__CYGWIN__ */
4349 Serge 104
 
105
/* We override the mbstate_t __count field for more complex encodings and use it to store a state value. */
106
#define __state __count /* lets the JIS converter write state->__state for the __count member */
107
 
108
#ifdef _MB_EXTENDED_CHARSETS_ISO
109
int
110
_DEFUN (__iso_mbtowc, (r, pwc, s, n, charset, state),
111
        struct _reent *r       _AND
112
        wchar_t       *pwc     _AND
113
        const char    *s       _AND
114
        size_t         n       _AND
115
	const char    *charset _AND
116
        mbstate_t      *state)
117
{
118
  wchar_t dummy;
119
  unsigned char *t = (unsigned char *)s;
120
 
121
  if (pwc == NULL)
122
    pwc = &dummy;
123
 
124
  if (s == NULL)
125
    return 0;
126
 
127
  if (n == 0)
128
    return -2;
129
 
130
  if (*t >= 0xa0)
131
    {
132
      int iso_idx = __iso_8859_index (charset + 9);
133
      if (iso_idx >= 0)
134
	{
135
	  *pwc = __iso_8859_conv[iso_idx][*t - 0xa0];
136
	  if (*pwc == 0) /* Invalid character */
137
	    {
138
	      r->_errno = EILSEQ;
139
	      return -1;
140
	    }
141
	  return 1;
142
	}
143
    }
144
 
145
  *pwc = (wchar_t) *t;
146
 
147
  if (*t == '\0')
148
    return 0;
149
 
150
  return 1;
151
}
152
#endif /* _MB_EXTENDED_CHARSETS_ISO */
153
 
154
#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
155
int
156
_DEFUN (__cp_mbtowc, (r, pwc, s, n, charset, state),
157
        struct _reent *r       _AND
158
        wchar_t       *pwc     _AND
159
        const char    *s       _AND
160
        size_t         n       _AND
161
	const char    *charset _AND
162
        mbstate_t      *state)
163
{
164
  wchar_t dummy;
165
  unsigned char *t = (unsigned char *)s;
166
 
167
  if (pwc == NULL)
168
    pwc = &dummy;
169
 
170
  if (s == NULL)
171
    return 0;
172
 
173
  if (n == 0)
174
    return -2;
175
 
176
  if (*t >= 0x80)
177
    {
178
      int cp_idx = __cp_index (charset + 2);
179
      if (cp_idx >= 0)
180
	{
181
	  *pwc = __cp_conv[cp_idx][*t - 0x80];
182
	  if (*pwc == 0) /* Invalid character */
183
	    {
184
	      r->_errno = EILSEQ;
185
	      return -1;
186
	    }
187
	  return 1;
188
	}
189
    }
190
 
191
  *pwc = (wchar_t)*t;
192
 
193
  if (*t == '\0')
194
    return 0;
195
 
196
  return 1;
197
}
198
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
199
 
200
int
201
_DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
202
        struct _reent *r       _AND
203
        wchar_t       *pwc     _AND
204
        const char    *s       _AND
205
        size_t         n       _AND
206
	const char    *charset _AND
207
        mbstate_t      *state)
208
{
209
  wchar_t dummy;
210
  unsigned char *t = (unsigned char *)s;
211
  int ch;
212
  int i = 0;
213
 
214
  if (pwc == NULL)
215
    pwc = &dummy;
216
 
217
  if (s == NULL)
218
    return 0;
219
 
220
  if (n == 0)
221
    return -2;
222
 
223
  if (state->__count == 0)
224
    ch = t[i++];
225
  else
226
    ch = state->__value.__wchb[0];
227
 
228
  if (ch == '\0')
229
    {
230
      *pwc = 0;
231
      state->__count = 0;
232
      return 0; /* s points to the null character */
233
    }
234
 
235
  if (ch <= 0x7f)
236
    {
237
      /* single-byte sequence */
238
      state->__count = 0;
239
      *pwc = ch;
240
      return 1;
241
    }
242
  if (ch >= 0xc0 && ch <= 0xdf)
243
    {
244
      /* two-byte sequence */
245
      state->__value.__wchb[0] = ch;
246
      if (state->__count == 0)
247
	state->__count = 1;
248
      else if (n < (size_t)-1)
249
	++n;
250
      if (n < 2)
251
	return -2;
252
      ch = t[i++];
253
      if (ch < 0x80 || ch > 0xbf)
254
	{
255
	  r->_errno = EILSEQ;
256
	  return -1;
257
	}
258
      if (state->__value.__wchb[0] < 0xc2)
259
	{
260
	  /* overlong UTF-8 sequence */
261
	  r->_errno = EILSEQ;
262
	  return -1;
263
	}
264
      state->__count = 0;
265
      *pwc = (wchar_t)((state->__value.__wchb[0] & 0x1f) << 6)
266
	|    (wchar_t)(ch & 0x3f);
267
      return i;
268
    }
269
  if (ch >= 0xe0 && ch <= 0xef)
270
    {
271
      /* three-byte sequence */
272
      wchar_t tmp;
273
      state->__value.__wchb[0] = ch;
274
      if (state->__count == 0)
275
	state->__count = 1;
276
      else if (n < (size_t)-1)
277
	++n;
278
      if (n < 2)
279
	return -2;
280
      ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
281
      if (state->__value.__wchb[0] == 0xe0 && ch < 0xa0)
282
	{
283
	  /* overlong UTF-8 sequence */
284
	  r->_errno = EILSEQ;
285
	  return -1;
286
	}
287
      if (ch < 0x80 || ch > 0xbf)
288
	{
289
	  r->_errno = EILSEQ;
290
	  return -1;
291
	}
292
      state->__value.__wchb[1] = ch;
293
      if (state->__count == 1)
294
	state->__count = 2;
295
      else if (n < (size_t)-1)
296
	++n;
297
      if (n < 3)
298
	return -2;
299
      ch = t[i++];
300
      if (ch < 0x80 || ch > 0xbf)
301
	{
302
	  r->_errno = EILSEQ;
303
	  return -1;
304
	}
305
      state->__count = 0;
306
      tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
307
	|    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
308
	|     (wchar_t)(ch & 0x3f);
309
      *pwc = tmp;
310
      return i;
311
    }
312
  if (ch >= 0xf0 && ch <= 0xf4)
313
    {
314
      /* four-byte sequence */
315
      wint_t tmp;
316
      state->__value.__wchb[0] = ch;
317
      if (state->__count == 0)
318
	state->__count = 1;
319
      else if (n < (size_t)-1)
320
	++n;
321
      if (n < 2)
322
	return -2;
323
      ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
324
      if ((state->__value.__wchb[0] == 0xf0 && ch < 0x90)
325
	  || (state->__value.__wchb[0] == 0xf4 && ch >= 0x90))
326
	{
327
	  /* overlong UTF-8 sequence or result is > 0x10ffff */
328
	  r->_errno = EILSEQ;
329
	  return -1;
330
	}
331
      if (ch < 0x80 || ch > 0xbf)
332
	{
333
	  r->_errno = EILSEQ;
334
	  return -1;
335
	}
336
      state->__value.__wchb[1] = ch;
337
      if (state->__count == 1)
338
	state->__count = 2;
339
      else if (n < (size_t)-1)
340
	++n;
341
      if (n < 3)
342
	return -2;
343
      ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
344
      if (ch < 0x80 || ch > 0xbf)
345
	{
346
	  r->_errno = EILSEQ;
347
	  return -1;
348
	}
349
      state->__value.__wchb[2] = ch;
350
      if (state->__count == 2)
351
	state->__count = 3;
352
      else if (n < (size_t)-1)
353
	++n;
354
      if (state->__count == 3 && sizeof(wchar_t) == 2)
355
	{
356
	  /* On systems which have wchar_t being UTF-16 values, the value
357
	     doesn't fit into a single wchar_t in this case.  So what we
358
	     do here is to store the state with a special value of __count
359
	     and return the first half of a surrogate pair.  The first
360
	     three bytes of a UTF-8 sequence are enough to generate the
361
	     first half of a UTF-16 surrogate pair.  As return value we
362
	     choose to return the number of bytes actually read up to
363
	     here.
364
	     The second half of the surrogate pair is returned in case we
365
	     recognize the special __count value of four, and the next
366
	     byte is actually a valid value.  See below. */
367
	  tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
368
	    |   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
369
	    |   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
370
	  state->__count = 4;
371
	  *pwc = 0xd800 | ((tmp - 0x10000) >> 10);
372
	  return i;
373
	}
374
      if (n < 4)
375
	return -2;
376
      ch = t[i++];
377
      if (ch < 0x80 || ch > 0xbf)
378
	{
379
	  r->_errno = EILSEQ;
380
	  return -1;
381
	}
382
      tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
383
	|   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
384
	|   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
385
	|   (wint_t)(ch & 0x3f);
386
      if (state->__count == 4 && sizeof(wchar_t) == 2)
387
	/* Create the second half of the surrogate pair for systems with
388
	   wchar_t == UTF-16 . */
389
	*pwc = 0xdc00 | (tmp & 0x3ff);
390
      else
391
	*pwc = tmp;
392
      state->__count = 0;
393
      return i;
394
    }
395
 
396
  r->_errno = EILSEQ;
397
  return -1;
398
}
399
 
400
/* Cygwin defines its own doublebyte charset conversion functions
401
   because the underlying OS requires wchar_t == UTF-16. */
402
#ifndef  __CYGWIN__
403
int
404
_DEFUN (__sjis_mbtowc, (r, pwc, s, n, charset, state),
405
        struct _reent *r       _AND
406
        wchar_t       *pwc     _AND
407
        const char    *s       _AND
408
        size_t         n       _AND
409
	const char    *charset _AND
410
        mbstate_t      *state)
411
{
412
  wchar_t dummy;
413
  unsigned char *t = (unsigned char *)s;
414
  int ch;
415
  int i = 0;
416
 
417
  if (pwc == NULL)
418
    pwc = &dummy;
419
 
420
  if (s == NULL)
421
    return 0;  /* not state-dependent */
422
 
423
  if (n == 0)
424
    return -2;
425
 
426
  ch = t[i++];
427
  if (state->__count == 0)
428
    {
429
      if (_issjis1 (ch))
430
	{
431
	  state->__value.__wchb[0] = ch;
432
	  state->__count = 1;
433
	  if (n <= 1)
434
	    return -2;
435
	  ch = t[i++];
436
	}
437
    }
438
  if (state->__count == 1)
439
    {
440
      if (_issjis2 (ch))
441
	{
442
	  *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
443
	  state->__count = 0;
444
	  return i;
445
	}
446
      else
447
	{
448
	  r->_errno = EILSEQ;
449
	  return -1;
450
	}
451
    }
452
 
453
  *pwc = (wchar_t)*t;
454
 
455
  if (*t == '\0')
456
    return 0;
457
 
458
  return 1;
459
}
460
 
461
int
462
_DEFUN (__eucjp_mbtowc, (r, pwc, s, n, charset, state),
463
        struct _reent *r       _AND
464
        wchar_t       *pwc     _AND
465
        const char    *s       _AND
466
        size_t         n       _AND
467
	const char    *charset _AND
468
        mbstate_t      *state)
469
{
470
  wchar_t dummy;
471
  unsigned char *t = (unsigned char *)s;
472
  int ch;
473
  int i = 0;
474
 
475
  if (pwc == NULL)
476
    pwc = &dummy;
477
 
478
  if (s == NULL)
479
    return 0;
480
 
481
  if (n == 0)
482
    return -2;
483
 
484
  ch = t[i++];
485
  if (state->__count == 0)
486
    {
487
      if (_iseucjp1 (ch))
488
	{
489
	  state->__value.__wchb[0] = ch;
490
	  state->__count = 1;
491
	  if (n <= 1)
492
	    return -2;
493
	  ch = t[i++];
494
	}
495
    }
496
  if (state->__count == 1)
497
    {
498
      if (_iseucjp2 (ch))
499
	{
500
	  if (state->__value.__wchb[0] == 0x8f)
501
	    {
502
	      state->__value.__wchb[1] = ch;
503
	      state->__count = 2;
504
	      if (n <= i)
505
		return -2;
506
	      ch = t[i++];
507
	    }
508
	  else
509
	    {
510
	      *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
511
	      state->__count = 0;
512
	      return i;
513
	    }
514
	}
515
      else
516
	{
517
	  r->_errno = EILSEQ;
518
	  return -1;
519
	}
520
    }
521
  if (state->__count == 2)
522
    {
523
      if (_iseucjp2 (ch))
524
	{
525
	  *pwc = (((wchar_t)state->__value.__wchb[1]) << 8)
526
		 + (wchar_t)(ch & 0x7f);
527
	  state->__count = 0;
528
	  return i;
529
	}
530
      else
531
	{
532
	  r->_errno = EILSEQ;
533
	  return -1;
534
	}
535
    }
536
 
537
  *pwc = (wchar_t)*t;
538
 
539
  if (*t == '\0')
540
    return 0;
541
 
542
  return 1;
543
}
544
 
545
int
546
_DEFUN (__jis_mbtowc, (r, pwc, s, n, charset, state),
547
        struct _reent *r       _AND
548
        wchar_t       *pwc     _AND
549
        const char    *s       _AND
550
        size_t         n       _AND
551
	const char    *charset _AND
552
        mbstate_t      *state)
553
{
554
  wchar_t dummy;
555
  unsigned char *t = (unsigned char *)s;
556
  JIS_STATE curr_state;
557
  JIS_ACTION action;
558
  JIS_CHAR_TYPE ch;
559
  unsigned char *ptr;
560
  unsigned int i;
561
  int curr_ch;
562
 
563
  if (pwc == NULL)
564
    pwc = &dummy;
565
 
566
  if (s == NULL)
567
    {
568
      state->__state = ASCII;
569
      return 1;  /* state-dependent */
570
    }
571
 
572
  if (n == 0)
573
    return -2;
574
 
575
  curr_state = state->__state;
576
  ptr = t;
577
 
578
  for (i = 0; i < n; ++i)
579
    {
580
      curr_ch = t[i];
581
      switch (curr_ch)
582
	{
583
	case ESC_CHAR:
584
	  ch = ESCAPE;
585
	  break;
586
	case '$':
587
	  ch = DOLLAR;
588
	  break;
589
	case '@':
590
	  ch = AT;
591
	  break;
592
	case '(':
593
	  ch = BRACKET;
594
	  break;
595
	case 'B':
596
	  ch = B;
597
	  break;
598
	case 'J':
599
	  ch = J;
600
	  break;
601
	case '\0':
602
	  ch = NUL;
603
	  break;
604
	default:
605
	  if (_isjis (curr_ch))
606
	    ch = JIS_CHAR;
607
	  else
608
	    ch = OTHER;
609
	}
610
 
611
      action = JIS_action_table[curr_state][ch];
612
      curr_state = JIS_state_table[curr_state][ch];
613
 
614
      switch (action)
615
	{
616
	case NOOP:
617
	  break;
618
	case EMPTY:
619
	  state->__state = ASCII;
620
	  *pwc = (wchar_t)0;
621
	  return 0;
622
	case COPY_A:
623
	  state->__state = ASCII;
624
	  *pwc = (wchar_t)*ptr;
625
	  return (i + 1);
626
	case COPY_J1:
627
	  state->__value.__wchb[0] = t[i];
628
	  break;
629
	case COPY_J2:
630
	  state->__state = JIS;
631
	  *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)(t[i]);
632
	  return (i + 1);
633
	case MAKE_A:
634
	  ptr = (unsigned char *)(t + i + 1);
635
	  break;
636
	case ERROR:
637
	default:
638
	  r->_errno = EILSEQ;
639
	  return -1;
640
	}
641
 
642
    }
643
 
644
  state->__state = curr_state;
645
  return -2;  /* n < bytes needed */
646
}
647
#endif /* !__CYGWIN__*/
648
#endif /* _MB_CAPABLE */