mbchar.c 9.33 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/* Multibyte Character Functions.
   Copyright (C) 1998 Free Software Foundation, Inc.

This file is part of GNU CC.

GNU CC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU CC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU CC; see the file COPYING.  If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */

/* Note regarding cross compilation:

   In general, translation of multibyte characters to wide characters can
   only work in a native compiler since the translation function (mbtowc)
   needs to know about both the source and target character encoding.  However,
   this particular implementation for JIS, SJIS and EUCJP source characters
   will work for any compiler with a newlib target.  Other targets may also
   work provided that their wchar_t implementation is 2 bytes and the encoding
   leaves the source character values unchanged (except for removing the
   state shifting markers).  */

#ifdef MULTIBYTE_CHARS
#include "config.h"
#include "system.h"
#include "mbchar.h"
#include <locale.h>

Jeff Law committed
38 39
/* Input classes for the JIS decoder; each byte read by local_mbtowc is
   mapped to one of these and used as the column index of the JIS
   tables.  JIS_C_NUM is not a class, it is the number of classes.  */
typedef enum
{
  ESCAPE,       /* JIS_ESC_CHAR */
  DOLLAR,       /* '$' */
  BRACKET,      /* '(' */
  AT,           /* '@' */
  B,            /* 'B' */
  J,            /* 'J' */
  NUL,          /* '\0' */
  JIS_CHAR,     /* any other byte accepted by ISJIS */
  OTHER,        /* any other byte rejected by ISJIS */
  JIS_C_NUM
} JIS_CHAR_TYPE;
40

Jeff Law committed
41 42 43 44 45 46 47 48 49 50 51 52
/* States of the JIS decoder; used as the row index of the JIS tables.
   JIS_S_NUM is not a state, it is the number of states.  */
typedef enum
{
  ASCII,        /* initial state, single-byte characters */
  A_ESC,        /* ASCII, then ESC */
  A_ESC_DL,     /* ASCII, then ESC '$' */
  JIS,          /* two-byte mode, at a character boundary */
  JIS_1,        /* one byte of a two-byte character consumed */
  JIS_2,        /* both bytes of a two-byte character consumed */
  J_ESC,        /* JIS, then ESC */
  J_ESC_BR,     /* JIS, then ESC '(' */
  J2_ESC,       /* JIS_2, then ESC */
  J2_ESC_BR,    /* JIS_2, then ESC '(' */
  INV,          /* invalid sequence */
  JIS_S_NUM
} JIS_STATE;

/* Actions the JIS decoder in local_mbtowc performs after classifying
   a byte.  */
typedef enum
{
  COPYA,        /* store the single byte at the character start */
  COPYJ,        /* store a two-byte character; length includes the
                   bytes scanned so far */
  COPYJ2,       /* store a two-byte character; length ends just after
                   its second byte */
  MAKE_A,       /* next byte becomes the character start */
  MAKE_J,       /* next byte becomes the character start */
  NOOP,         /* keep scanning */
  EMPTY,        /* store a null wide character */
  ERROR         /* invalid sequence, report failure */
} JIS_ACTION;

/* State/action tables for processing JIS encoding:

   Where possible, switches to JIS are grouped with proceding JIS characters
   and switches to ASCII are grouped with preceding JIS characters.
   Thus, maximum returned length is:
     2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6.  */
53 54

static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
Jeff Law committed
55
/*            ESCAPE DOLLAR   BRACKET   AT     B      J     NUL JIS_CHAR OTH*/
56 57 58 59 60 61 62 63 64 65 66 67 68
/*ASCII*/   { A_ESC, ASCII,   ASCII,    ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
/*A_ESC*/   { ASCII, A_ESC_DL,ASCII,    ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
/*A_ESC_DL*/{ ASCII, ASCII,   ASCII,    JIS,   JIS,   ASCII, ASCII,ASCII,ASCII},
/*JIS*/     { J_ESC, JIS_1,   JIS_1,    JIS_1, JIS_1, JIS_1, INV,  JIS_1,INV },
/*JIS_1*/   { INV,   JIS_2,   JIS_2,    JIS_2, JIS_2, JIS_2, INV,  JIS_2,INV },
/*JIS_2*/   { J2_ESC,JIS,     JIS,      JIS,   JIS,   JIS,   INV,  JIS,  JIS },
/*J_ESC*/   { INV,   INV,     J_ESC_BR, INV,   INV,   INV,   INV,  INV,  INV },
/*J_ESC_BR*/{ INV,   INV,     INV,      INV,   ASCII, ASCII, INV,  INV,  INV },
/*J2_ESC*/  { INV,   INV,     J2_ESC_BR,INV,   INV,   INV,   INV,  INV,  INV },
/*J2_ESC_BR*/{INV,   INV,     INV,      INV,   ASCII, ASCII, INV,  INV,  INV },
};

static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
Jeff Law committed
69
/*            ESCAPE DOLLAR BRACKET AT     B       J      NUL  JIS_CHAR OTH */
70 71 72
/*ASCII */   {NOOP,  COPYA, COPYA, COPYA,  COPYA,  COPYA, EMPTY, COPYA, COPYA},
/*A_ESC */   {COPYA, NOOP,  COPYA, COPYA,  COPYA,  COPYA, COPYA, COPYA, COPYA},
/*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
Jeff Law committed
73 74
/*JIS */     {NOOP,  NOOP,  NOOP,  NOOP,   NOOP,   NOOP,  ERROR, NOOP,  ERROR},
/*JIS_1 */   {ERROR, NOOP,  NOOP,  NOOP,   NOOP,   NOOP,  ERROR, NOOP,  ERROR},
75
/*JIS_2 */   {NOOP,  COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
Jeff Law committed
76 77 78 79
/*J_ESC */   {ERROR, ERROR, NOOP,  ERROR,  ERROR,  ERROR, ERROR, ERROR, ERROR},
/*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR,  NOOP,   NOOP,  ERROR, ERROR, ERROR},
/*J2_ESC */  {ERROR, ERROR, NOOP,  ERROR,  ERROR,  ERROR, ERROR, ERROR, ERROR},
/*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR,  COPYJ,  COPYJ, ERROR, ERROR, ERROR},
80 81 82
};


83
/* Codeset name consulted by local_mbtowc and local_mb_cur_max.  They
   compare it against "C-SJIS", "C-EUCJP" and "C-JIS"; when it is NULL,
   at most one character long, or any other name, they fall back to
   their default handling.  */
const char *literal_codeset = NULL;
84

Jeff Law committed
85 86 87 88 89 90 91 92 93
/* Store into *PWC (if PWC is not null) the wide character
   corresponding to the multibyte character at the start of the
   buffer S of size N.  Return the number of bytes in the multibyte
   character.  Return -1 if the bytes do not form a valid character,
   or 0 if S is null or points to a null byte.

   This function behaves like the Standard C function mbtowc, except
   it treats locale names of the form "C-..." specially.  */

94 95
int
local_mbtowc (pwc, s, n)
Jeff Law committed
96
     wchar_t *pwc;
97
     const char *s;
Jeff Law committed
98
     size_t n;
99 100 101
{
  static JIS_STATE save_state = ASCII;
  JIS_STATE curr_state = save_state;
102
  const unsigned char *t = (const unsigned char *) s;
103 104 105 106 107

  if (s != NULL && n == 0)
    return -1;

  if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
Jeff Law committed
108 109
    /* This must be the "C" locale or unknown locale -- fall thru */
    ;
110 111 112 113
  else if (! strcmp (literal_codeset, "C-SJIS"))
    {
      int char1;
      if (s == NULL)
Jeff Law committed
114 115 116
	/* Not state-dependent.  */
        return 0;

117 118 119 120
      char1 = *t;
      if (ISSJIS1 (char1))
        {
          int char2 = t[1];
Jeff Law committed
121

122 123
          if (n <= 1)
            return -1;
Jeff Law committed
124

125 126 127
          if (ISSJIS2 (char2))
            {
	      if (pwc != NULL)
Jeff Law committed
128
		*pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
129 130
              return 2;
            }
Jeff Law committed
131

132 133
	  return -1;
        }
Jeff Law committed
134

135
      if (pwc != NULL)
Jeff Law committed
136 137
	*pwc = (wchar_t) *t;

138 139
      if (*t == '\0')
	return 0;
Jeff Law committed
140

141 142 143 144 145
      return 1;
    }
  else if (! strcmp (literal_codeset, "C-EUCJP"))
    {
      int char1;
Jeff Law committed
146

147
      if (s == NULL)
Jeff Law committed
148 149 150
	/* Not state-dependent.  */
        return 0;

151 152 153 154
      char1 = *t;
      if (ISEUCJP (char1))
        {
          int char2 = t[1];     
Jeff Law committed
155

156 157
          if (n <= 1)
            return -1;
Jeff Law committed
158

159 160 161
          if (ISEUCJP (char2))
            {
	      if (pwc != NULL)
Jeff Law committed
162
		*pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
163 164
              return 2;
            }
Jeff Law committed
165

166 167
	  return -1;
        }
Jeff Law committed
168

169
      if (pwc != NULL)
Jeff Law committed
170 171
	*pwc = (wchar_t) *t;

172 173
      if (*t == '\0')
	return 0;
Jeff Law committed
174

175 176 177 178 179 180
      return 1;
    }
  else if (! strcmp (literal_codeset, "C-JIS"))
    {
      JIS_ACTION action;
      JIS_CHAR_TYPE ch;
181 182
      const unsigned char *ptr;
      size_t i, curr_ch;
183 184 185 186
 
      if (s == NULL)
	{
	  save_state = ASCII;
Jeff Law committed
187 188
	  /* State-dependent. */
	  return 1;
189 190 191 192
	}

      ptr = t;

Jeff Law committed
193
      for (i = 0; i < n; i++)
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
        {
          curr_ch = t[i];
          switch (curr_ch)
            {
	    case JIS_ESC_CHAR:
              ch = ESCAPE;
              break;
	    case '$':
              ch = DOLLAR;
              break;
            case '@':
              ch = AT;
              break;
            case '(':
	      ch = BRACKET;
              break;
            case 'B':
              ch = B;
              break;
            case 'J':
              ch = J;
              break;
            case '\0':
              ch = NUL;
              break;
            default:
              if (ISJIS (curr_ch))
                ch = JIS_CHAR;
              else
                ch = OTHER;
	    }

          action = JIS_action_table[curr_state][ch];
          curr_state = JIS_state_table[curr_state][ch];
        
          switch (action)
            {
            case NOOP:
              break;
Jeff Law committed
233

234 235
            case EMPTY:
	      if (pwc != NULL)
Jeff Law committed
236 237
		*pwc = (wchar_t) 0;

238 239
	      save_state = curr_state;
              return i;
Jeff Law committed
240

241 242
            case COPYA:
	      if (pwc != NULL)
Jeff Law committed
243
		*pwc = (wchar_t) *ptr;
244
	      save_state = curr_state;
Jeff Law committed
245 246
              return i + 1;

247 248
            case COPYJ:
	      if (pwc != NULL)
Jeff Law committed
249 250
		*pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));

251
	      save_state = curr_state;
Jeff Law committed
252 253
              return i + 1;

254 255
            case COPYJ2:
	      if (pwc != NULL)
Jeff Law committed
256 257
		*pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));

258
	      save_state = curr_state;
Jeff Law committed
259 260
              return ptr - t + 2;

261 262
            case MAKE_A:
            case MAKE_J:
263
              ptr = (const unsigned char *) (t + i + 1);
264
              break;
Jeff Law committed
265

266 267 268 269 270 271
            case ERROR:
            default:
              return -1;
            }
        }

Jeff Law committed
272 273
      /* More than n bytes needed.  */
      return -1;  
274 275 276 277
    }
               
#ifdef CROSS_COMPILE
  if (s == NULL)
Jeff Law committed
278 279 280
    /* Not state-dependent.  */
    return 0;

281 282 283 284
  if (pwc != NULL)
    *pwc = *s;
  return 1;
#else
Jeff Law committed
285

286 287 288 289 290
  /* This must be the "C" locale or unknown locale. */
  return mbtowc (pwc, s, n);
#endif
}

Jeff Law committed
291 292 293 294 295 296 297
/* Return the number of bytes in the multibyte character at the start
   of the buffer S of size N.  The result is exactly what local_mbtowc
   returns for the same S and N when it is given no place to store the
   wide character, including its -1 and S == NULL return values.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  /* A null destination makes local_mbtowc measure without storing.  */
  wchar_t *const no_result = NULL;

  return local_mbtowc (no_result, s, n);
}

Jeff Law committed
306 307 308 309 310
/* Return the maximum mumber of bytes in a multibyte character.

   This function returns the same value as the Standard C macro MB_CUR_MAX,
   except it treats locale names of the form "C-..." specially.  */

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
int
local_mb_cur_max ()
{
  if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
    ;
  else if (! strcmp (literal_codeset, "C-SJIS"))
    return 2;
  else if (! strcmp (literal_codeset, "C-EUCJP"))
    return 2;
  else if (! strcmp (literal_codeset, "C-JIS"))
    return 8; /* 3 + 2 + 3 */

#ifdef CROSS_COMPILE
  return 1;
#else
326 327 328 329
  if (MB_CUR_MAX > 0)
    return MB_CUR_MAX;

  return 1; /* default */
330 331
#endif
}
332 333
#else  /* MULTIBYTE_CHARS */
extern int dummy;  /* silence 'ANSI C forbids an empty source file' warning */
334
#endif /* MULTIBYTE_CHARS */