unicode-decomp.pl: Move from chartables.pl...

2002-03-04 Eric Blake <ebb9@email.byu.edu> * scripts/unicode-decomp.pl: Move from chartables.pl, and remove the code for generating include/java-chartables.h. * scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and merge with Classpath. * scripts/unicode-muncher.pl: Copy from Classpath. * scripts/MakeCharTables.java: New file. * gnu/gcj/convert/Blocks-3.txt: New file. * gnu/gcj/convert/UnicodeData-3.0.0.txt: New file. * gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file. * gnu/java/lang/CharData.java: Copy from Classpath. * Makefile.am (ordinary_java_source_files): Add gnu/java/lang/CharData.java. * configure.in: Remove --enable-fast-character option. * java/lang/Character.java: Merge algorithms and Javadoc with Classpath. * java/lang/natCharacter.cc: Implement Unicode lookup table more efficiently. * include/java-chardecomp.h: Regenerate. * include/java-chartables.h: Regenerate. From-SVN: r50368

unicode-decomp.pl: Move from chartables.pl...
2002-03-04 Eric Blake <ebb9@email.byu.edu> * scripts/unicode-decomp.pl: Move from chartables.pl, and remove the code for generating include/java-chartables.h. * scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and merge with Classpath. * scripts/unicode-muncher.pl: Copy from Classpath. * scripts/MakeCharTables.java: New file. * gnu/gcj/convert/Blocks-3.txt: New file. * gnu/gcj/convert/UnicodeData-3.0.0.txt: New file. * gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file. * gnu/java/lang/CharData.java: Copy from Classpath. * Makefile.am (ordinary_java_source_files): Add gnu/java/lang/CharData.java. * configure.in: Remove --enable-fast-character option. * java/lang/Character.java: Merge algorithms and Javadoc with Classpath. * java/lang/natCharacter.cc: Implement Unicode lookup table more efficiently. * include/java-chardecomp.h: Regenerate. * include/java-chartables.h: Regenerate. From-SVN: r50368
1fa78272 · Eric Blake · Eric Blake · b87e4a4c · 1fa78272 · 1fa78272
Commit 1fa78272 authored Mar 06, 2002 by Eric Blake Committed by Eric Blake Mar 06, 2002
17 changed files
--- a/libjava/ChangeLog
+++ b/libjava/ChangeLog
+2002-03-04  Eric Blake  <ebb9@email.byu.edu>
+
+	* scripts/unicode-decomp.pl: Move from chartables.pl, and remove
+	the code for generating include/java-chartables.h.
+	* scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and
+	merge with Classpath.
+	* scripts/unicode-muncher.pl: Copy from Classpath.
+	* scripts/MakeCharTables.java: New file.
+	* gnu/gcj/convert/Blocks-3.txt: New file.
+	* gnu/gcj/convert/UnicodeData-3.0.0.txt: New file.
+	* gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file.
+	* gnu/java/lang/CharData.java: Copy from Classpath.
+	* Makefile.am (ordinary_java_source_files): Add
+	gnu/java/lang/CharData.java.
+	* configure.in: Remove --enable-fast-character option.
+	* java/lang/Character.java: Merge algorithms and Javadoc with
+	Classpath.
+	* java/lang/natCharacter.cc: Implement Unicode lookup table more
+	efficiently.
+	* include/java-chardecomp.h: Regenerate.
+	* include/java-chartables.h: Regenerate.
+
 2002-03-06  Bryce McKinlay  <bryce@waitaki.otago.ac.nz>

 	* java/awt/MediaTracker.java: Implemented.

--- a/libjava/Makefile.am
+++ b/libjava/Makefile.am
@@ -1288,6 +1288,7 @@ gnu/java/io/NullOutputStream.java \
 gnu/java/io/ObjectIdentityWrapper.java \
 gnu/java/lang/ArrayHelper.java \
 gnu/java/lang/ClassHelper.java \
+gnu/java/lang/CharData.java \
 gnu/java/lang/reflect/TypeSignature.java \
 gnu/java/locale/Calendar.java \
 gnu/java/locale/Calendar_de.java \

--- a/libjava/chartables.pl
+++ b/libjava/chartables.pl
--- a/libjava/configure.in
+++ b/libjava/configure.in
@@ -42,13 +42,6 @@ AC_SUBST(COMPPATH)
 dnl The -no-testsuite modules omit the test subdir.
 AM_CONDITIONAL(TESTSUBDIR, test -d $srcdir/testsuite)

-dnl See whether the user prefers size or speed for Character.
-dnl The default is size.
-AC_ARG_ENABLE(fast-character,
-[  --enable-fast-character prefer speed over size for Character],
-# Nothing
-, AC_DEFINE(COMPACT_CHARACTER))
-
 dnl Should the runtime set system properties by examining the 
 dnl environment variable GCJ_PROPERTIES?
 AC_ARG_ENABLE(getenv-properties,

--- a/libjava/gnu/gcj/convert/Blocks-3.txt
+++ b/libjava/gnu/gcj/convert/Blocks-3.txt
+# Start Code; End Code; Block Name
+0000; 007F; Basic Latin
+0080; 00FF; Latin-1 Supplement
+0100; 017F; Latin Extended-A
+0180; 024F; Latin Extended-B
+0250; 02AF; IPA Extensions
+02B0; 02FF; Spacing Modifier Letters
+0300; 036F; Combining Diacritical Marks
+0370; 03FF; Greek
+0400; 04FF; Cyrillic
+0530; 058F; Armenian
+0590; 05FF; Hebrew
+0600; 06FF; Arabic
+0700; 074F; Syriac  
+0780; 07BF; Thaana
+0900; 097F; Devanagari
+0980; 09FF; Bengali
+0A00; 0A7F; Gurmukhi
+0A80; 0AFF; Gujarati
+0B00; 0B7F; Oriya
+0B80; 0BFF; Tamil
+0C00; 0C7F; Telugu
+0C80; 0CFF; Kannada
+0D00; 0D7F; Malayalam
+0D80; 0DFF; Sinhala
+0E00; 0E7F; Thai
+0E80; 0EFF; Lao
+0F00; 0FFF; Tibetan
+1000; 109F; Myanmar 
+10A0; 10FF; Georgian
+1100; 11FF; Hangul Jamo
+1200; 137F; Ethiopic
+13A0; 13FF; Cherokee
+1400; 167F; Unified Canadian Aboriginal Syllabics
+1680; 169F; Ogham
+16A0; 16FF; Runic
+1780; 17FF; Khmer
+1800; 18AF; Mongolian
+1E00; 1EFF; Latin Extended Additional
+1F00; 1FFF; Greek Extended
+2000; 206F; General Punctuation
+2070; 209F; Superscripts and Subscripts
+20A0; 20CF; Currency Symbols
+20D0; 20FF; Combining Marks for Symbols
+2100; 214F; Letterlike Symbols
+2150; 218F; Number Forms
+2190; 21FF; Arrows
+2200; 22FF; Mathematical Operators
+2300; 23FF; Miscellaneous Technical
+2400; 243F; Control Pictures
+2440; 245F; Optical Character Recognition
+2460; 24FF; Enclosed Alphanumerics
+2500; 257F; Box Drawing
+2580; 259F; Block Elements
+25A0; 25FF; Geometric Shapes
+2600; 26FF; Miscellaneous Symbols
+2700; 27BF; Dingbats
+2800; 28FF; Braille Patterns
+2E80; 2EFF; CJK Radicals Supplement
+2F00; 2FDF; Kangxi Radicals
+2FF0; 2FFF; Ideographic Description Characters
+3000; 303F; CJK Symbols and Punctuation
+3040; 309F; Hiragana
+30A0; 30FF; Katakana
+3100; 312F; Bopomofo
+3130; 318F; Hangul Compatibility Jamo
+3190; 319F; Kanbun
+31A0; 31BF; Bopomofo Extended
+3200; 32FF; Enclosed CJK Letters and Months
+3300; 33FF; CJK Compatibility
+3400; 4DB5; CJK Unified Ideographs Extension A
+4E00; 9FFF; CJK Unified Ideographs
+A000; A48F; Yi Syllables
+A490; A4CF; Yi Radicals
+AC00; D7A3; Hangul Syllables
+D800; DB7F; High Surrogates
+DB80; DBFF; High Private Use Surrogates
+DC00; DFFF; Low Surrogates
+E000; F8FF; Private Use
+F900; FAFF; CJK Compatibility Ideographs
+FB00; FB4F; Alphabetic Presentation Forms
+FB50; FDFF; Arabic Presentation Forms-A
+FE20; FE2F; Combining Half Marks
+FE30; FE4F; CJK Compatibility Forms
+FE50; FE6F; Small Form Variants
+FE70; FEFE; Arabic Presentation Forms-B
+FEFF; FEFF; Specials
+FF00; FFEF; Halfwidth and Fullwidth Forms
+FFF0; FFFD; Specials
+
--- a/libjava/gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html
+++ b/libjava/gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
+
+       "http://www.w3.org/TR/REC-html40/loose.dtd"> 
+
+<html>
+
+
+
+<head>
+
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+
+<meta http-equiv="Content-Language" content="en-us">
+
+<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
+
+<meta name="ProgId" content="FrontPage.Editor.Document">
+
+<link rel="stylesheet" href="http://www.unicode.org/unicode.css" type="text/css">
+
+<title>Unicode Character Database</title>
+
+</head>
+
+
+
+<body>
+
+
+
+<h1>UNICODE CHARACTER DATABASE<br>  
+Version 3.0.0</h1>
+
+<table border="1" cellspacing="2" cellpadding="0" height="87" width="100%">
+
+  <tr>
+
+    <td valign="TOP" width="144">Revision</td>
+
+    <td valign="TOP">3.0.0</td>
+
+  </tr>
+
+  <tr>
+
+    <td valign="TOP" width="144">Authors</td>
+
+    <td valign="TOP">Mark Davis and Ken Whistler</td>
+
+  </tr>
+
+  <tr>
+
+    <td valign="TOP" width="144">Date</td>
+
+    <td valign="TOP">1999-09-11</td>
+
+  </tr>
+
+  <tr>
+
+    <td valign="TOP" width="144">This Version</td>
+
+    <td valign="TOP"><a href="ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html">ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html</a></td>
+
+  </tr>
+
+  <tr>
+
+    <td valign="TOP" width="144">Previous Version</td>
+
+    <td valign="TOP">n/a</td>
+
+  </tr>
+
+  <tr>
+
+    <td valign="TOP" width="144">Latest Version</td>
+
+    <td valign="TOP"><a href="ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html">ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html</a></td>
+
+  </tr>
+
+</table>
+
+<p align="center">Copyright © 1995-1999 Unicode, Inc. All Rights reserved.</p>  
+  
+<h2>Disclaimer</h2>  
+  
+<p>The Unicode Character Database is provided as is by Unicode, Inc. No claims   
+  
+are made as to fitness for any particular purpose. No warranties of any kind are   
+  
+expressed or implied. The recipient agrees to determine applicability of   
+  
+information provided. If this file has been purchased on magnetic or optical   
+  
+media from Unicode, Inc., the sole remedy for any claim will be exchange of   
+  
+defective media within 90 days of receipt.</p>  
+  
+<p>This disclaimer is applicable for all other data files accompanying the   
+  
+Unicode Character Database, some of which have been compiled by the Unicode   
+  
+Consortium, and some of which have been supplied by other sources.</p>  
+  
+<h2>Limitations on Rights to Redistribute This Data</h2>  
+  
+<p>Recipient is granted the right to make copies in any form for internal   
+  
+distribution and to freely use the information supplied in the creation of   
+  
+products supporting the Unicode<sup>TM</sup> Standard. The files in the Unicode   
+  
+Character Database can be redistributed to third parties or other organizations   
+  
+(whether for profit or not) as long as this notice and the disclaimer notice are   
+  
+retained. Information can be extracted from these files and used in   
+  
+documentation or programs, as long as there is an accompanying notice indicating   
+  
+the source.</p>  
+  
+<h2>Introduction</h2>  
+  
+<p>The Unicode Character Database is a set of files that define the Unicode   
+  
+character properties and internal mappings. For more information about character   
+  
+properties and mappings, see <i><a href="http://www.unicode.org/unicode/uni2book/u2.html">The   
+  
+Unicode Standard</a></i>.</p>  
+  
+<p>The Unicode Character Database has been updated to reflect Version 3.0 of the   
+  
+Unicode Standard, with many characters added to those published in Version 2.0.   
+  
+A number of corrections have also been made to case mappings or other errors in   
+  
+the database noted since the publication of Version 2.0. Normative bidirectional   
+  
+properties have also been modified to reflect decisions of the Unicode Technical   
+  
+Committee.</p>  
+  
+<p>For more information on versions of the Unicode Standard and how to reference   
+  
+them, see <a href="http://www.unicode.org/unicode/standard/versions/">http://www.unicode.org/unicode/standard/versions/</a>.</p>  
+  
+<h2>Conformance</h2>  
+  
+<p>Character properties may be either normative or informative. <i>Normative</i>   
+  
+means that implementations that claim conformance to the Unicode Standard (at a   
+  
+particular version) and which make use of a particular property or field must   
+  
+follow the specifications of the standard for that property or field in order to   
+  
+be conformant. The term <i>normative</i> when applied to a property or field of   
+  
+the Unicode Character Database, does <i>not</i> mean that the value of that   
+  
+field will never change. Corrections and extensions to the standard in the   
+  
+future may require minor changes to normative values, even though the Unicode   
+  
+Technical Committee strives to minimize such changes. An<i> informative </i>property   
+  
+or field is strongly recommended, but a conformant implementation is free to use   
+  
+or change such values as it may require while still being conformant to the   
+  
+standard. Particular implementations may choose to override the properties and   
+  
+mappings that are not normative. In that case, it is up to the implementer to   
+  
+establish a protocol to convey that information.</p>  
+  
+<h2>Files</h2>  
+  
+<p>The following summarizes the files in the Unicode Character Database. &nbsp;For   
+  
+more information about these files, see the referenced technical report or   
+  
+section of Unicode Standard, Version 3.0.</p>  
+  
+<p><b>UnicodeData.txt (Chapter 4)</b>  
+  
+<ul>  
+  
+  <li>The main file in the Unicode Character Database.</li>  
+  
+  <li>For detailed information on the format, see <a href="UnicodeData.html">UnicodeData.html</a>.   
+  
+    This file also characterizes which properties are normative and which are   
+  
+    informative.</li>  
+  
+</ul>  
+  
+<p><b>PropList.txt (Chapter 4)</b>  
+  
+<ul>  
+  
+  <li>Additional informative properties list: <i>Alphabetic, Ideographic,</i>   
+  
+    and <i>Mathematical</i>, among others.</li>  
+  
+</ul>  
+  
+<p><b>SpecialCasing.txt (Chapter 4)</b>  
+  
+<ul>  
+  
+  <li>List of informative special casing properties, including one-to-many   
+  
+    mappings such as SHARP S =&gt; &quot;SS&quot;, and locale-specific mappings,   
+  
+    such as for Turkish <i>dotless i</i>.</li>  
+  
+</ul>  
+  
+<p><b>Blocks.txt (Chapter 14)</b>  
+  
+<ul>  
+  
+  <li>List of normative block names.</li>  
+  
+</ul>  
+  
+<p><b>Jamo.txt (Chapter 4)</b>  
+  
+<ul>  
+  
+  <li>List of normative Jamo short names, used in deriving HANGUL SYLLABLE names   
+  
+    algorithmically.</li>  
+  
+</ul>  
+  
+<p><b>ArabicShaping.txt (Section 8.2)</b>  
+  
+<ul>  
+  
+  <li>Basic Arabic and Syriac character shaping properties, such as initial,   
+  
+    medial and final shapes. These properties are normative for minimal shaping   
+  
+    of Arabic and Syriac. </li>  
+  
+</ul>  
+  
+<p><b>NamesList.txt (Chapter 14)</b>  
+  
+<ul>  
+  
+  <li>This file duplicates some of the material in the UnicodeData file, and   
+  
+    adds informative annotations used in the character charts, as printed in the   
+  
+    Unicode Standard. </li>  
+  
+  <li><b>Note: </b>The information in NamesList.txt and Index.txt files matches   
+  
+    the appropriate version of the book. Changes in the Unicode Character   
+  
+    Database since then may not be reflected in these files, since they are   
+  
+    primarily of archival interest.</li>  
+  
+</ul>  
+  
+<p><b>Index.txt (Chapter 14)</b>  
+  
+<ul>  
+  
+  <li>Informative index to Unicode characters, as printed in the Unicode   
+  
+    Standard</li>  
+  
+  <li><b>Note: </b>The information in NamesList.txt and Index.txt files matches   
+  
+    the appropriate version of the book. Changes in the Unicode Character   
+  
+    Database since then may not be reflected in these files, since they are   
+  
+    primarily of archival interest.</li>  
+  
+</ul>  
+  
+<p><b>CompositionExclusions.txt (<a href="http://www.unicode.org/unicode/reports/tr15/">UTR#15   
+  
+Unicode Normalization Forms</a>)</b>  
+  
+<ul>  
+  
+  <li>Normative properties for normalization.</li>  
+  
+</ul>  
+  
+<p><b>LineBreak.txt (<a href="http://www.unicode.org/unicode/reports/tr14/">UTR   
+  
+#14: Line Breaking Properties</a>)</b>  
+  
+<ul>  
+  
+  <li>Normative and informative properties for line breaking. To see which   
+  
+    properties are informative and which are normative, consult UTR#14.</li>  
+  
+</ul>  
+  
+<p><b>EastAsianWidth.txt (<a href="http://www.unicode.org/unicode/reports/tr11/">UTR   
+  
+#11: East Asian Character Width</a>)</b>  
+  
+<ul>  
+  
+  <li>Informative properties for determining the choice of wide vs. narrow   
+  
+    glyphs in East Asian contexts.</li>  
+  
+</ul>  
+  
+<p><b>diffXvY.txt</b>  
+  
+<ul>  
+  
+  <li>Mechanically-generated informative files containing accumulated   
+  
+    differences between successive versions of UnicodeData.txt</li>  
+  
+</ul>  
+  
+  
+  
+</body>  
+  
+  
+  
+</html>  
+  
--- a/libjava/gnu/gcj/convert/UnicodeData-3.0.0.txt
+++ b/libjava/gnu/gcj/convert/UnicodeData-3.0.0.txt
--- a/libjava/gnu/java/lang/CharData.java
+++ b/libjava/gnu/java/lang/CharData.java
--- a/libjava/include/java-chardecomp.h
+++ b/libjava/include/java-chardecomp.h
--- a/libjava/include/java-chartables.h
+++ b/libjava/include/java-chartables.h
--- a/libjava/java/lang/Character.java
+++ b/libjava/java/lang/Character.java
--- a/libjava/java/lang/natCharacter.cc
+++ b/libjava/java/lang/natCharacter.cc
-// natCharacter.cc - Native part of Character class.
-
-/* Copyright (C) 1998, 1999  Free Software Foundation
-
-   This file is part of libgcj.
-
-This software is copyrighted work licensed under the terms of the
-Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
-details.  */
+/* java.lang.Character -- Wrapper class for char, and Unicode subsets
+   Copyright (C) 1998, 1999, 2001, 2002 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING.  If not, write to the
+Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+02111-1307 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library.  Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module.  An independent module is a module which is not derived from
+or based on this library.  If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so.  If you do not wish to do so, delete this
+exception statement from your version. */

 #include <config.h>

@@ -18,267 +45,69 @@ details.  */



-#define asize(x)  ((sizeof (x)) / sizeof (x[0]))
-
-static jchar
-to_lower_title (jchar ch)
-{
-  for (unsigned int i = 0; i < asize (title_to_upper_table); ++i)
-    {
-      // We can assume that the entries in the two tables are
-      // parallel.  This is checked in the script.
-      if (title_to_upper_table[i][1] == ch
-	  || title_to_upper_table[i][0] == ch)
-	return title_to_lower_table[i][1];
-    }
-  return ch;
-}
-
-static jchar
-to_upper_title (jchar ch)
-{
-  for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
-    {
-      // We can assume that the entries in the two tables are
-      // parallel.  This is checked in the script.
-      if (title_to_lower_table[i][1] == ch
-	  || title_to_lower_table[i][0] == ch)
-	return title_to_upper_table[i][1];
-    }
-  return ch;
-}
-
-jboolean
-java::lang::Character::isTitleCase (jchar ch)
-{
-  for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
-    {
-      if (title_to_lower_table[i][0] == ch)
-	return true;
-    }
-  return false;
-}
-
 jchar
-java::lang::Character::toTitleCase (jchar ch)
+java::lang::Character::readChar(jchar ch)
 {
-  // Both titlecase mapping tables have the same length.  This is
-  // checked in the chartables script.
-  for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
-    {
-      if (title_to_lower_table[i][0] == ch)
-	return ch;
-      if (title_to_lower_table[i][1] == ch)
-	return title_to_lower_table[i][0];
-      if (title_to_upper_table[i][1] == ch)
-	return title_to_upper_table[i][0];
-    }
-  return toUpperCase (ch);
-}
-
-#ifdef COMPACT_CHARACTER
-
-static int
-table_search (const jchar table[][2], int table_len, jchar ch)
-{
-  int low, high, i, old;
-
-  low = 0;
-  high = table_len;
-  i = high / 2;
-
-  while (true)
-    {
-      if (ch < table[i][0])
-	high = i;
-      else if (ch > table[i][1])
-	low = i;
-      else
-	return i;
-
-      old = i;
-      i = (high + low) / 2;
-      if (i == old)
-	break;
-    }
-
-  return -1;
-}
-
-jint
-java::lang::Character::digit_value (jchar ch)
-{
-  int index = table_search (digit_table, asize (digit_table), ch);
-  if (index == -1)
-    return -1;
-
-  jchar base = digit_table[index][0];
-  // Tamil doesn't have a digit `0'.  So we special-case it here.
-  if (base == TAMIL_DIGIT_ONE)
-    return ch - base + 1;
-  return ch - base;
-}
-
-jint
-java::lang::Character::getNumericValue (jchar ch)
-{
-  jint d = digit (ch, 36);
-  if (d != -1)
-    return d;
-
-  for (unsigned int i = 0; i < asize (numeric_table); ++i)
-    {
-      if (numeric_table[i] == ch)
-	return numeric_value[i];
-    }
-
-  return -1;
+  // Perform 16-bit addition to find the correct entry in data.
+  return data[(jchar) (blocks[ch >> SHIFT] + ch)];
 }

 jint
-java::lang::Character::getType (jchar ch)
+java::lang::Character::getType(jchar ch)
 {
-  int index = table_search (all_table, asize (all_table), ch);
-  if (index != -1)
-    return category_table[index];
-  return UNASSIGNED;
-}
-
-jboolean
-java::lang::Character::isLowerCase (jchar ch)
-{
-  if (ch >= 0x2000 && ch <= 0x2fff)
-    return false;
-  if (table_search (lower_case_table, asize (lower_case_table), ch) != -1)
-    return true;
-
-  int low, high, i, old;
-
-  low = 0;
-  high = asize (lower_anomalous_table);
-  i = high / 2;
-
-  while (true)
-    {
-      if (ch < lower_anomalous_table[i])
-	high = i;
-      else if (ch > lower_anomalous_table[i])
-	low = i;
-      else
-	return true;
-
-      old = i;
-      i = (high + low) / 2;
-      if (i == old)
-	break;
-    }
-
-  return false;
-}
-
-jboolean
-java::lang::Character::isSpaceChar (jchar ch)
-{
-  return table_search (space_table, asize (space_table), ch) != -1;
-}
-
-jboolean
-java::lang::Character::isUpperCase (jchar ch)
-{
-  if (ch >= 0x2000 && ch <= 0x2fff)
-    return false;
-  return table_search (upper_case_table, asize (upper_case_table), ch) != -1;
+  // Perform 16-bit addition to find the correct entry in data.
+  return (jint) (data[(jchar) (blocks[ch >> SHIFT] + ch)] & TYPE_MASK);
 }

 jchar
-java::lang::Character::toLowerCase (jchar ch)
+java::lang::Character::toLowerCase(jchar ch)
 {
-  int index = table_search (upper_case_table, asize (upper_case_table), ch);
-  if (index == -1)
-    return to_lower_title (ch);
-  return (jchar) (ch - upper_case_table[index][0]
-		  + upper_case_map_table[index]);
+  return (jchar) (ch + lower[readChar(ch) >> 7]);
 }

 jchar
-java::lang::Character::toUpperCase (jchar ch)
+java::lang::Character::toUpperCase(jchar ch)
 {
-  int index = table_search (lower_case_table, asize (lower_case_table), ch);
-  if (index == -1)
-    return to_upper_title (ch);
-  return (jchar) (ch - lower_case_table[index][0]
-		  + lower_case_map_table[index]);
+  return (jchar) (ch + upper[readChar(ch) >> 7]);
 }

-#else /* COMPACT_CHARACTER */
-
-jint
-java::lang::Character::digit_value (jchar ch)
+jchar
+java::lang::Character::toTitleCase(jchar ch)
 {
-  if (type_table[ch] == DECIMAL_DIGIT_NUMBER)
-    return attribute_table[ch];
-  return -1;
+  // As title is short, it doesn't hurt to exhaustively iterate over it.
+  for (int i = title_length - 2; i >= 0; i -= 2)
+    if (title[i] == ch)
+      return title[i + 1];
+  return toUpperCase(ch);
 }

 jint
-java::lang::Character::getNumericValue (jchar ch)
-{
-  jint d = digit (ch, 36);
-  if (d != -1)
-    return d;
-
-  // Some characters require two attributes.  We special-case them here.
-  if (ch >= ROMAN_START && ch <= ROMAN_END)
-    return secondary_attribute_table[ch - ROMAN_START];
-  if (type_table[ch] == LETTER_NUMBER || type_table[ch] == OTHER_NUMBER)
-    return attribute_table[ch];
-  return -1;
+java::lang::Character::digit(jchar ch, jint radix)
+{
+  if (radix < MIN_RADIX || radix > MAX_RADIX)
+    return (jint) -1;
+  jchar attr = readChar(ch);
+  if (((1 << (attr & TYPE_MASK))
+       & ((1 << UPPERCASE_LETTER)
+          | (1 << LOWERCASE_LETTER)
+          | (1 << DECIMAL_DIGIT_NUMBER))))
+    {
+      // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
+      jint digit = (jint) numValue[attr >> 7];
+      return (digit >= 0 && digit < radix) ? digit : (jint) -1;
+    }
+  return (jint) -1;
 }

 jint
-java::lang::Character::getType (jchar ch)
-{
-  return type_table[ch];
-}
-
-jboolean
-java::lang::Character::isLowerCase (jchar ch)
-{
-  if (ch >= 0x2000 && ch <= 0x2fff)
-    return false;
-  return type_table[ch] == LOWERCASE_LETTER;
-}
-
-jboolean
-java::lang::Character::isSpaceChar (jchar ch)
-{
-  return (type_table[ch] == SPACE_SEPARATOR
-	  || type_table[ch] == LINE_SEPARATOR
-	  || type_table[ch] == PARAGRAPH_SEPARATOR);
-}
-
-jboolean
-java::lang::Character::isUpperCase (jchar ch)
-{
-  if (ch >= 0x2000 && ch <= 0x2fff)
-    return false;
-  return type_table[ch] == UPPERCASE_LETTER;
-}
-
-jchar
-java::lang::Character::toLowerCase (jchar ch)
+java::lang::Character::getNumericValue(jchar ch)
 {
-  if (type_table[ch] == UPPERCASE_LETTER)
-    return attribute_table[ch];
-  return to_lower_title (ch);
+  // numValue is stored as an array of jshort, since 10000 is the maximum.
+  return (jint) numValue[readChar(ch) >> 7];
 }

-jchar
-java::lang::Character::toUpperCase (jchar ch)
+jbyte
+java::lang::Character::getDirectionality(jchar ch)
 {
-  if (type_table[ch] == LOWERCASE_LETTER)
-    return attribute_table[ch];
-  return to_upper_title (ch);
+  return direction[readChar(ch) >> 7];
 }
-
-#endif /* COMPACT_CHARACTER */
--- a/libjava/scripts/MakeCharTables.java
+++ b/libjava/scripts/MakeCharTables.java
--- a/libjava/scripts/blocks.pl
+++ b/libjava/scripts/blocks.pl
-#! /usr/bin/perl
-
-if ($ARGV[0] eq '')
-{
-    $file = 'Blocks.txt';
-    if (! -f $file)
-    {
-	# Too painful to figure out how to get Perl to do it.
-	system 'wget -o .wget-log http://www.unicode.org/Public/UNIDATA/Blocks.txt';
-    }
-}
-else
-{
-    $file = $ARGV[0];
-}
-
-open (INPUT, "< $file") || die "couldn't open $file: $!";
-
-@array = ();
-while (<INPUT>)
-{
-    next if /^#/;
-    chop;
-    next if /^$/;
-
-    ($start, $to, $text) = split (/; /);
-    ($symbol = $text) =~ tr/a-z/A-Z/;
-    $symbol =~ s/[- ]/_/g;
-
-    # Special case for one of the SPECIALS.
-    next if $start eq 'FEFF';
-
-    # Special case some areas that our heuristic mishandles.
-    if ($symbol eq 'HIGH_SURROGATES')
-    {
-	$symbol = 'SURROGATES_AREA';
-	$text = 'Surrogates Area';
-	$to = 'DFFF';
-    }
-    elsif ($symbol =~ /SURROGATES/)
-    {
-	next;
-    }
-    elsif ($symbol eq 'PRIVATE_USE')
-    {
-	$symbol .= '_AREA';
-	$text = 'Private Use Area';
-    }
-
-    printf "    public static final UnicodeBlock %s = new UnicodeBlock (\"%s\", '\\u%s', '\\u%s');\n",
-           $symbol, $text, $start, $to;
-
-    push (@array, $symbol);
-}
-
-printf "    private static final UnicodeBlock[] blocks = {\n";
-foreach (@array)
-{
-    printf "      %s", $_;
-    printf "," unless $_ eq 'SPECIALS';
-    printf "\n";
-}
-printf "    };\n";
-
-close (INPUT);
--- a/libjava/scripts/unicode-blocks.pl
+++ b/libjava/scripts/unicode-blocks.pl
+#!/usr/bin/perl -w
+# unicode-blocks.pl -- Script to generate java.lang.Character.UnicodeBlock
+# Copyright (C) 2002 Free Software Foundation, Inc.
+#
+# This file is part of GNU Classpath.
+#
+# GNU Classpath is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# GNU Classpath is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Classpath; see the file COPYING.  If not, write to the
+# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+# 02111-1307 USA.
+#
+# Linking this library statically or dynamically with other modules is
+# making a combined work based on this library.  Thus, the terms and
+# conditions of the GNU General Public License cover the whole
+# combination.
+#
+# As a special exception, the copyright holders of this library give you
+# permission to link this library with independent modules to produce an
+# executable, regardless of the license terms of these independent
+# modules, and to copy and distribute the resulting executable under
+# terms of your choice, provided that you also meet, for each linked
+# independent module, the terms and conditions of the license of that
+# module.  An independent module is a module which is not derived from
+# or based on this library.  If you modify this library, you may extend
+# this exception to your version of the library, but you are not
+# obligated to do so.  If you do not wish to do so, delete this
+# exception statement from your version.
+
+
+# Code for reading Blocks.txt and generating (to standard out) the code for
+# java.lang.Character.UnicodeBlock, for pasting into java/lang/Character.java.
+# You should probably check that the results are accurate to the
+# specification, but I made sure it works OOB for Unicode 3.0.0 and JDK 1.4.
+# As the grammar for the Blocks.txt file is changing in Unicode 3.2.0, you
+# will have to tweak this some for future use.  For now, the relevant
+# Unicode definition files are found in libjava/gnu/gcj/convert/.
+#
+# author Eric Blake <ebb9@email.byu.edu>
+#
+# usage: unicode-blocks.pl <blocks.txt>
+#    where <blocks.txt> is obtained from www.unicode.org (named Blocks-3.txt
+#    for Unicode version 3.0.0).
+
+
+# Exactly one argument is required: the path to the Unicode block list.
+die "Usage: $0 <blocks.txt>" unless @ARGV == 1;
+# Open read-only with the three-argument form, so that a file name that
+# starts with '>' or ends with '|' cannot be taken as a mode or a command.
+open (BLOCKS, '<', $ARGV[0]) || die "Can't open Unicode block file: $!\n";
+
+# Block constants and the JDK release to name in their generated @since tag.
+# Every entry listed here was added in 1.4.  Keeping this list current is
+# optional (and tedious), but nice.
+my %additions = map { ($_ => "1.4") } qw(
+    SYRIAC
+    THAANA
+    SINHALA
+    MYANMAR
+    ETHIOPIC
+    CHEROKEE
+    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
+    OGHAM
+    RUNIC
+    KHMER
+    MONGOLIAN
+    BRAILLE_PATTERNS
+    CJK_RADICALS_SUPPLEMENT
+    KANGXI_RADICALS
+    IDEOGRAPHIC_DESCRIPTION_CHARACTERS
+    BOPOMOFO_EXTENDED
+    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
+    YI_SYLLABLES
+    YI_RADICALS
+);
+
+# Emit the fixed head of the generated UnicodeBlock inner class.  The Javadoc
+# names the input file, so that single line is printed from an interpolating
+# string.  Everything else stays in non-interpolating heredocs so that Javadoc
+# tags such as @param and escapes such as '\uFEFF' are written out literally.
+print <<'EOF';
+  /**
+   * A family of character subsets in the Unicode specification. A character
+   * is in at most one of these blocks.
+   *
+   * This inner class was generated automatically from
+EOF
+print "   * <code>$ARGV[0]</code>, by some perl scripts.\n";
+print <<'EOF';
+   * This Unicode definition file can be found on the
+   * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
+   * JDK 1.4 uses Unicode version 3.0.0.
+   *
+   * @author scripts/unicode-blocks.pl (written by Eric Blake)
+   * @since 1.2
+   */
+  public static final class UnicodeBlock extends Subset
+  {
+    /** The start of the subset. */
+    private final char start;
+
+    /** The end of the subset. */
+    private final char end;
+
+    /**
+     * Constructor for strictly defined blocks.
+     *
+     * @param start the start character of the range
+     * @param end the end character of the range
+     * @param name the block name
+     */
+    private UnicodeBlock(char start, char end, String name)
+    {
+      super(name);
+      this.start = start;
+      this.end = end;
+    }
+
+    /**
+     * Returns the Unicode character block which a character belongs to.
+     *
+     * @param ch the character to look up
+     * @return the set it belongs to, or null if it is not in one
+     */
+    public static UnicodeBlock of(char ch)
+    {
+      // Special case, since SPECIALS contains two ranges.
+      if (ch == '\uFEFF')
+        return SPECIALS;
+      // Simple binary search for the correct block.
+      int low = 0;
+      int hi = sets.length - 1;
+      while (low <= hi)
+        {
+          int mid = (low + hi) >> 1;
+          UnicodeBlock b = sets[mid];
+          if (ch < b.start)
+            hi = mid - 1;
+          else if (ch > b.end)
+            low = mid + 1;
+          else
+            return b;
+        }
+      return null;
+    }
+EOF
+
+# Read the block list and print one UnicodeBlock constant per block.  Data
+# lines are split on "; " into start, end and block name; comment lines and
+# lines without a third field are ignored.  The constant names are collected
+# in @names, in input order, for the lookup table printed afterwards.
+my @names = ();
+my $specialsSkipped = 0;
+my $surrogateCount = 0;
+my $surrogateFirst = 0;
+while (my $line = <BLOCKS>) {
+    next if $line =~ /^\#/;
+    my ($start, $end, $block) = split(/; /, $line);
+    next unless defined $block;
+    chomp $block;
+    $block =~ s/ *$//;
+
+    # SPECIALS is two disjoint ranges.  Only the second one listed becomes
+    # the constant; the first occurrence is dropped here.
+    if ($block =~ /Specials/ and not $specialsSkipped) {
+        $specialsSkipped = 1;
+        next;
+    }
+
+    # SURROGATES_AREA is one range in Java, not three consecutive ones.
+    # Remember where the first surrogate block starts, swallow the first
+    # two, and stretch the third back to that start.
+    if ($block =~ /Surrogates/) {
+        $surrogateCount++;
+        $surrogateFirst = $start if $surrogateCount == 1;
+        next if $surrogateCount < 3;
+        $start = $surrogateFirst;
+        $block = "Surrogates Area";
+    }
+
+    # Java calls the Private Use block PRIVATE_USE_AREA.
+    $block =~ s/(Private Use)/$1 Area/;
+
+    # The constant name is the block name in upper case, with spaces and
+    # hyphens turned into underscores.
+    my $name = $block;
+    $name =~ tr/a-z -/A-Z__/;
+    push @names, $name;
+
+    # Optional Javadoc additions: an @since tag for later additions, and
+    # the extra '\uFEFF' character in the description of SPECIALS.
+    my $since = "";
+    if (defined $additions{$name}) {
+        $since = "\n     * \@since $additions{$name}";
+    }
+    my $extra = "";
+    $extra = "'\\uFEFF', " if $block =~ /Specials/;
+
+    print <<EOF;
+
+    /**
+     * $block.
+     * $extra'\\u$start' - '\\u$end'.$since
+     */
+    public final static UnicodeBlock $name
+      = new UnicodeBlock('\\u$start', '\\u$end',
+                         "$name");
+EOF
+}
+
+# Finish the class: print the sets[] array, listing every constant name
+# collected in @names in the order it was read, then close the class.
+print <<'EOF';
+
+    /**
+     * The defined subsets.
+     */
+    private static final UnicodeBlock sets[] = {
+EOF
+
+print "      $_,\n" for @names;
+
+print <<'EOF';
+    };
+  } // class UnicodeBlock
+EOF
--- a/libjava/scripts/unicode-decomp.pl
+++ b/libjava/scripts/unicode-decomp.pl
+#!/usr/bin/perl -w
+# unicode-decomp.pl - script to generate database for java.text.Collator
+# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
+#
+# This file is part of libjava.
+# 
+# This software is copyrighted work licensed under the terms of the
+# Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
+# details.
+
+# Code for reading UnicodeData.txt and generating the decomposition tables
+# in include/java-chardecomp.h.  For now, the relevant Unicode definition
+# files are found in libjava/gnu/gcj/convert/.
+#
+# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
+#   where <UnicodeData.txt> is obtained from www.unicode.org (named
+#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
+#   is the final location of include/java-chardecomp.h.
+#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
+#
+# If this exits with nonzero status, then you must investigate the
+# cause of the problem.
+# Diagnostics and other information to stderr.
+# With -n, the files are not created, but all processing still occurs.
+
+# These map characters to their decompositions.  Keys are numeric code
+# points; values are the expansion packed as 16-bit big-endian units.
+my %canonical_decomposition = ();
+my %full_decomposition = ();
+
+# Handle `-n' and check the arguments.  With -n the output file argument is
+# optional, and whatever was given is replaced by /dev/null, so that all
+# processing still occurs but no file is created.  The argument count is
+# checked before $ARGV[1] is assigned: assigning first would silently pad
+# an empty @ARGV to two elements and defeat the usage check.
+my $dry_run = 0;
+if (@ARGV && $ARGV[0] eq '-n')
+{
+    shift @ARGV;
+    $dry_run = 1;
+}
+die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
+    unless @ARGV == 2 || ($dry_run && @ARGV == 1);
+$ARGV[1] = '/dev/null' if $dry_run;
+
+# Three-argument open: the file name is never parsed for mode characters.
+open (UNICODE, '<', $ARGV[0]) || die "Can't open Unicode attribute file: $!\n";
+
+# Process the Unicode file.  Every line with a non-empty decomposition
+# field is recorded in %full_decomposition if the field carries a <tag>,
+# and in %canonical_decomposition otherwise.
+$| = 1;
+my $count = 0;
+print STDERR "Parsing attributes file";
+while (my $line = <UNICODE>)
+{
+    # One progress dot for every 1000 input lines.
+    print STDERR "." unless $count++ % 1000;
+    chomp $line;
+    $line =~ s/\r//g;
+
+    # Field 0 is the code point in hex, field 5 the decomposition.
+    my ($ch, undef, undef, undef, undef, $decomp) = split ';', $line;
+
+    # Blank or truncated lines have no decomposition field at all; skip
+    # them up front instead of tripping "uninitialized value" warnings
+    # under -w.  Lines with an empty field are skipped as before.
+    next unless defined $decomp && $decomp ne '';
+    $ch = hex ($ch);
+
+    my $is_full = 0;
+    my @decomp = ();
+    foreach my $item (split (' ', $decomp))
+    {
+        # A <tag> is not part of the expansion; it only marks the entry
+        # as belonging to the full table.
+        if ($item =~ /^\<.*\>$/)
+        {
+            $is_full = 1;
+            next;
+        }
+        push (@decomp, hex ($item));
+    }
+
+    # Store the expansion as a string of 16-bit big-endian units.
+    my $s = pack "n*", @decomp;
+    if ($is_full)
+    {
+        $full_decomposition{$ch} = $s;
+    }
+    else
+    {
+        $canonical_decomposition{$ch} = $s;
+    }
+}
+close (UNICODE);
+
+# Now generate decomposition tables.  The output file is opened with the
+# three-argument form so its name is never parsed for mode characters.
+open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
+print STDERR "\nGenerating tables\n";
+print DECOMP <<EOF;
+// java-chardecomp.h - Decomposition character tables -*- c++ -*-
+
+#ifndef __JAVA_CHARDECOMP_H__
+#define __JAVA_CHARDECOMP_H__
+
+
+// These tables are automatically generated by the $0
+// script.  DO NOT EDIT the tables.  Instead, fix the script
+// and run it again.
+
+// This file should only be included by natCollator.cc
+
+struct decomp_entry
+{
+  jchar key;
+  const char *value;
+};
+
+EOF
+
+# The sub is defined further down; the `&' form of the call bypasses its
+# prototype, which has not been seen yet at this point.
+&write_decompositions;
+
+print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
+
+# Buffered write errors (a full disk, for instance) are only reported when
+# the handle is closed, so check the close and exit with nonzero status
+# rather than leave a truncated header behind unnoticed.
+close (DECOMP) or die "Can't write output file: $!\n";
+print STDERR "Done\n";
+exit;
+
+
+# Write a single decomposition table to DECOMP as a C array of
+# decomp_entry initializers.
+#   $name     - prefix of the array, which is named ${name}_decomposition
+#   $is_canon - accepted, but not used by this sub
+#   %table    - maps a character to its expansion, packed with "n*"
+# Only keys in the range 0 .. 0xffff are written, in ascending order.
+sub write_single_decomposition($$%)
+{
+    my ($name, $is_canon, %table) = @_;
+    my @entries = ();
+
+    foreach my $code (0 .. 0xffff)
+    {
+        my $packed = $table{$code};
+        next unless defined $packed;
+
+        # The expansion is written as a string of bytes, two \xNN escapes
+        # per 16-bit unit with the high byte first.  This is ugly, but
+        # relatively space-efficient: most expansions are short, but a few
+        # are very long (e.g. \uFDFA), so a fixed-space representation
+        # would waste a lot of space.
+        # NOTE(review): the previous comment described the value as
+        # terminated with a double nul, but no nul bytes are written
+        # explicitly here -- confirm against the reader of this table.
+        my $bytes = '';
+        foreach my $unit (unpack ("n*", $packed))
+        {
+            $bytes .= sprintf ("\\x%02x\\x%02x", $unit >> 8, $unit & 0xff);
+        }
+
+        push @entries, sprintf ("  { 0x%04x, \"%s\" }", $code, $bytes);
+    }
+
+    # Entries are separated by ",\n"; there is no comma after the last one.
+    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
+    print DECOMP join (",\n", @entries);
+    print DECOMP "\n};\n\n";
+}
+
+# Write both decomposition tables to DECOMP: the canonical one first, then
+# the full one.
+sub write_decompositions()
+{
+    my @tables = (['canonical', 1, \%canonical_decomposition],
+                  ['full', 0, \%full_decomposition]);
+    foreach my $entry (@tables)
+    {
+        my ($name, $is_canon, $map) = @$entry;
+        &write_single_decomposition ($name, $is_canon, %$map);
+    }
+}
--- a/libjava/scripts/unicode-muncher.pl
+++ b/libjava/scripts/unicode-muncher.pl