Commit 1fa78272 by Eric Blake Committed by Eric Blake

unicode-decomp.pl: Move from chartables.pl...

2002-03-04  Eric Blake  <ebb9@email.byu.edu>

	* scripts/unicode-decomp.pl: Move from chartables.pl, and remove
	the code for generating include/java-chartables.h.
	* scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and
	merge with Classpath.
	* scripts/unicode-muncher.pl: Copy from Classpath.
	* scripts/MakeCharTables.java: New file.
	* gnu/gcj/convert/Blocks-3.txt: New file.
	* gnu/gcj/convert/UnicodeData-3.0.0.txt: New file.
	* gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file.
	* gnu/java/lang/CharData.java: Copy from Classpath.
	* Makefile.am (ordinary_java_source_files): Add
	gnu/java/lang/CharData.java.
	* configure.in: Remove --enable-fast-character option.
	* java/lang/Character.java: Merge algorithms and Javadoc with
	Classpath.
	* java/lang/natCharacter.cc: Implement Unicode lookup table more
	efficiently.
	* include/java-chardecomp.h: Regenerate.
	* include/java-chartables.h: Regenerate.

From-SVN: r50368
parent b87e4a4c
2002-03-04 Eric Blake <ebb9@email.byu.edu>
* scripts/unicode-decomp.pl: Move from chartables.pl, and remove
the code for generating include/java-chartables.h.
* scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and
merge with Classpath.
* scripts/unicode-muncher.pl: Copy from Classpath.
* scripts/MakeCharTables.java: New file.
* gnu/gcj/convert/Blocks-3.txt: New file.
* gnu/gcj/convert/UnicodeData-3.0.0.txt: New file.
* gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file.
* gnu/java/lang/CharData.java: Copy from Classpath.
* Makefile.am (ordinary_java_source_files): Add
gnu/java/lang/CharData.java.
* configure.in: Remove --enable-fast-character option.
* java/lang/Character.java: Merge algorithms and Javadoc with
Classpath.
* java/lang/natCharacter.cc: Implement Unicode lookup table more
efficiently.
* include/java-chardecomp.h: Regenerate.
* include/java-chartables.h: Regenerate.
2002-03-06 Bryce McKinlay <bryce@waitaki.otago.ac.nz>
* java/awt/MediaTracker.java: Implemented.
......
......@@ -1288,6 +1288,7 @@ gnu/java/io/NullOutputStream.java \
gnu/java/io/ObjectIdentityWrapper.java \
gnu/java/lang/ArrayHelper.java \
gnu/java/lang/ClassHelper.java \
gnu/java/lang/CharData.java \
gnu/java/lang/reflect/TypeSignature.java \
gnu/java/locale/Calendar.java \
gnu/java/locale/Calendar_de.java \
......
......@@ -42,13 +42,6 @@ AC_SUBST(COMPPATH)
dnl The -no-testsuite modules omit the test subdir.
AM_CONDITIONAL(TESTSUBDIR, test -d $srcdir/testsuite)
dnl See whether the user prefers size or speed for Character.
dnl The default is size.
AC_ARG_ENABLE(fast-character,
[ --enable-fast-character prefer speed over size for Character],
# Nothing
, AC_DEFINE(COMPACT_CHARACTER))
dnl Should the runtime set system properties by examining the
dnl environment variable GCJ_PROPERTIES?
AC_ARG_ENABLE(getenv-properties,
......
# Start Code; End Code; Block Name
0000; 007F; Basic Latin
0080; 00FF; Latin-1 Supplement
0100; 017F; Latin Extended-A
0180; 024F; Latin Extended-B
0250; 02AF; IPA Extensions
02B0; 02FF; Spacing Modifier Letters
0300; 036F; Combining Diacritical Marks
0370; 03FF; Greek
0400; 04FF; Cyrillic
0530; 058F; Armenian
0590; 05FF; Hebrew
0600; 06FF; Arabic
0700; 074F; Syriac
0780; 07BF; Thaana
0900; 097F; Devanagari
0980; 09FF; Bengali
0A00; 0A7F; Gurmukhi
0A80; 0AFF; Gujarati
0B00; 0B7F; Oriya
0B80; 0BFF; Tamil
0C00; 0C7F; Telugu
0C80; 0CFF; Kannada
0D00; 0D7F; Malayalam
0D80; 0DFF; Sinhala
0E00; 0E7F; Thai
0E80; 0EFF; Lao
0F00; 0FFF; Tibetan
1000; 109F; Myanmar
10A0; 10FF; Georgian
1100; 11FF; Hangul Jamo
1200; 137F; Ethiopic
13A0; 13FF; Cherokee
1400; 167F; Unified Canadian Aboriginal Syllabics
1680; 169F; Ogham
16A0; 16FF; Runic
1780; 17FF; Khmer
1800; 18AF; Mongolian
1E00; 1EFF; Latin Extended Additional
1F00; 1FFF; Greek Extended
2000; 206F; General Punctuation
2070; 209F; Superscripts and Subscripts
20A0; 20CF; Currency Symbols
20D0; 20FF; Combining Marks for Symbols
2100; 214F; Letterlike Symbols
2150; 218F; Number Forms
2190; 21FF; Arrows
2200; 22FF; Mathematical Operators
2300; 23FF; Miscellaneous Technical
2400; 243F; Control Pictures
2440; 245F; Optical Character Recognition
2460; 24FF; Enclosed Alphanumerics
2500; 257F; Box Drawing
2580; 259F; Block Elements
25A0; 25FF; Geometric Shapes
2600; 26FF; Miscellaneous Symbols
2700; 27BF; Dingbats
2800; 28FF; Braille Patterns
2E80; 2EFF; CJK Radicals Supplement
2F00; 2FDF; Kangxi Radicals
2FF0; 2FFF; Ideographic Description Characters
3000; 303F; CJK Symbols and Punctuation
3040; 309F; Hiragana
30A0; 30FF; Katakana
3100; 312F; Bopomofo
3130; 318F; Hangul Compatibility Jamo
3190; 319F; Kanbun
31A0; 31BF; Bopomofo Extended
3200; 32FF; Enclosed CJK Letters and Months
3300; 33FF; CJK Compatibility
3400; 4DB5; CJK Unified Ideographs Extension A
4E00; 9FFF; CJK Unified Ideographs
A000; A48F; Yi Syllables
A490; A4CF; Yi Radicals
AC00; D7A3; Hangul Syllables
D800; DB7F; High Surrogates
DB80; DBFF; High Private Use Surrogates
DC00; DFFF; Low Surrogates
E000; F8FF; Private Use
F900; FAFF; CJK Compatibility Ideographs
FB00; FB4F; Alphabetic Presentation Forms
FB50; FDFF; Arabic Presentation Forms-A
FE20; FE2F; Combining Half Marks
FE30; FE4F; CJK Compatibility Forms
FE50; FE6F; Small Form Variants
FE70; FEFE; Arabic Presentation Forms-B
FEFF; FEFF; Specials
FF00; FFEF; Halfwidth and Fullwidth Forms
FFF0; FFFD; Specials
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<link rel="stylesheet" href="http://www.unicode.org/unicode.css" type="text/css">
<title>Unicode Character Database</title>
</head>
<body>
<h1>UNICODE CHARACTER DATABASE<br>
Version 3.0.0</h1>
<table border="1" cellspacing="2" cellpadding="0" height="87" width="100%">
<tr>
<td valign="TOP" width="144">Revision</td>
<td valign="TOP">3.0.0</td>
</tr>
<tr>
<td valign="TOP" width="144">Authors</td>
<td valign="TOP">Mark Davis and Ken Whistler</td>
</tr>
<tr>
<td valign="TOP" width="144">Date</td>
<td valign="TOP">1999-09-11</td>
</tr>
<tr>
<td valign="TOP" width="144">This Version</td>
<td valign="TOP"><a href="ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html">ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html</a></td>
</tr>
<tr>
<td valign="TOP" width="144">Previous Version</td>
<td valign="TOP">n/a</td>
</tr>
<tr>
<td valign="TOP" width="144">Latest Version</td>
<td valign="TOP"><a href="ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html">ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html</a></td>
</tr>
</table>
<p align="center">Copyright © 1995-1999 Unicode, Inc. All Rights reserved.</p>
<h2>Disclaimer</h2>
<p>The Unicode Character Database is provided as is by Unicode, Inc. No claims
are made as to fitness for any particular purpose. No warranties of any kind are
expressed or implied. The recipient agrees to determine applicability of
information provided. If this file has been purchased on magnetic or optical
media from Unicode, Inc., the sole remedy for any claim will be exchange of
defective media within 90 days of receipt.</p>
<p>This disclaimer is applicable for all other data files accompanying the
Unicode Character Database, some of which have been compiled by the Unicode
Consortium, and some of which have been supplied by other sources.</p>
<h2>Limitations on Rights to Redistribute This Data</h2>
<p>Recipient is granted the right to make copies in any form for internal
distribution and to freely use the information supplied in the creation of
products supporting the Unicode<sup>TM</sup> Standard. The files in the Unicode
Character Database can be redistributed to third parties or other organizations
(whether for profit or not) as long as this notice and the disclaimer notice are
retained. Information can be extracted from these files and used in
documentation or programs, as long as there is an accompanying notice indicating
the source.</p>
<h2>Introduction</h2>
<p>The Unicode Character Database is a set of files that define the Unicode
character properties and internal mappings. For more information about character
properties and mappings, see <i><a href="http://www.unicode.org/unicode/uni2book/u2.html">The
Unicode Standard</a></i>.</p>
<p>The Unicode Character Database has been updated to reflect Version 3.0 of the
Unicode Standard, with many characters added to those published in Version 2.0.
A number of corrections have also been made to case mappings or other errors in
the database noted since the publication of Version 2.0. Normative bidirectional
properties have also been modified to reflect decisions of the Unicode Technical
Committee.</p>
<p>For more information on versions of the Unicode Standard and how to reference
them, see <a href="http://www.unicode.org/unicode/standard/versions/">http://www.unicode.org/unicode/standard/versions/</a>.</p>
<h2>Conformance</h2>
<p>Character properties may be either normative or informative. <i>Normative</i>
means that implementations that claim conformance to the Unicode Standard (at a
particular version) and which make use of a particular property or field must
follow the specifications of the standard for that property or field in order to
be conformant. The term <i>normative</i> when applied to a property or field of
the Unicode Character Database, does <i>not</i> mean that the value of that
field will never change. Corrections and extensions to the standard in the
future may require minor changes to normative values, even though the Unicode
Technical Committee strives to minimize such changes. An<i> informative </i>property
or field is strongly recommended, but a conformant implementation is free to use
or change such values as it may require while still being conformant to the
standard. Particular implementations may choose to override the properties and
mappings that are not normative. In that case, it is up to the implementer to
establish a protocol to convey that information.</p>
<h2>Files</h2>
<p>The following summarizes the files in the Unicode Character Database. &nbsp;For
more information about these files, see the referenced technical report or
section of Unicode Standard, Version 3.0.</p>
<p><b>UnicodeData.txt (Chapter 4)</b>
<ul>
<li>The main file in the Unicode Character Database.</li>
<li>For detailed information on the format, see <a href="UnicodeData.html">UnicodeData.html</a>.
This file also characterizes which properties are normative and which are
informative.</li>
</ul>
<p><b>PropList.txt (Chapter 4)</b>
<ul>
<li>Additional informative properties list: <i>Alphabetic, Ideographic,</i>
and <i>Mathematical</i>, among others.</li>
</ul>
<p><b>SpecialCasing.txt (Chapter 4)</b>
<ul>
<li>List of informative special casing properties, including one-to-many
mappings such as SHARP S =&gt; &quot;SS&quot;, and locale-specific mappings,
such as for Turkish <i>dotless i</i>.</li>
</ul>
<p><b>Blocks.txt (Chapter 14)</b>
<ul>
<li>List of normative block names.</li>
</ul>
<p><b>Jamo.txt (Chapter 4)</b>
<ul>
<li>List of normative Jamo short names, used in deriving HANGUL SYLLABLE names
algorithmically.</li>
</ul>
<p><b>ArabicShaping.txt (Section 8.2)</b>
<ul>
<li>Basic Arabic and Syriac character shaping properties, such as initial,
medial and final shapes. These properties are normative for minimal shaping
of Arabic and Syriac. </li>
</ul>
<p><b>NamesList.txt (Chapter 14)</b>
<ul>
<li>This file duplicates some of the material in the UnicodeData file, and
adds informative annotations used in the character charts, as printed in the
Unicode Standard. </li>
<li><b>Note: </b>The information in NamesList.txt and Index.txt files matches
the appropriate version of the book. Changes in the Unicode Character
Database since then may not be reflected in these files, since they are
primarily of archival interest.</li>
</ul>
<p><b>Index.txt (Chapter 14)</b>
<ul>
<li>Informative index to Unicode characters, as printed in the Unicode
Standard</li>
<li><b>Note: </b>The information in NamesList.txt and Index.txt files matches
the appropriate version of the book. Changes in the Unicode Character
Database since then may not be reflected in these files, since they are
primarily of archival interest.</li>
</ul>
<p><b>CompositionExclusions.txt (<a href="http://www.unicode.org/unicode/reports/tr15/">UTR#15
Unicode Normalization Forms</a>)</b>
<ul>
<li>Normative properties for normalization.</li>
</ul>
<p><b>LineBreak.txt (<a href="http://www.unicode.org/unicode/reports/tr14/">UTR
#14: Line Breaking Properties</a>)</b>
<ul>
<li>Normative and informative properties for line breaking. To see which
properties are informative and which are normative, consult UTR#14.</li>
</ul>
<p><b>EastAsianWidth.txt (<a href="http://www.unicode.org/unicode/reports/tr11/">UTR
#11: East Asian Character Width</a>)</b>
<ul>
<li>Informative properties for determining the choice of wide vs. narrow
glyphs in East Asian contexts.</li>
</ul>
<p><b>diffXvY.txt</b>
<ul>
<li>Mechanically-generated informative files containing accumulated
differences between successive versions of UnicodeData.txt</li>
</ul>
</body>
</html>
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
// natCharacter.cc - Native part of Character class.
/* Copyright (C) 1998, 1999 Free Software Foundation
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
/* java.lang.Character -- Wrapper class for char, and Unicode subsets
Copyright (C) 1998, 1999, 2001, 2002 Free Software Foundation, Inc.
This file is part of GNU Classpath.
GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.
Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.
As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version. */
#include <config.h>
......@@ -18,267 +45,69 @@ details. */
#define asize(x) ((sizeof (x)) / sizeof (x[0]))
static jchar
to_lower_title (jchar ch)
{
for (unsigned int i = 0; i < asize (title_to_upper_table); ++i)
{
// We can assume that the entries in the two tables are
// parallel. This is checked in the script.
if (title_to_upper_table[i][1] == ch
|| title_to_upper_table[i][0] == ch)
return title_to_lower_table[i][1];
}
return ch;
}
static jchar
to_upper_title (jchar ch)
{
for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
{
// We can assume that the entries in the two tables are
// parallel. This is checked in the script.
if (title_to_lower_table[i][1] == ch
|| title_to_lower_table[i][0] == ch)
return title_to_upper_table[i][1];
}
return ch;
}
jboolean
java::lang::Character::isTitleCase (jchar ch)
{
for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
{
if (title_to_lower_table[i][0] == ch)
return true;
}
return false;
}
jchar
java::lang::Character::toTitleCase (jchar ch)
java::lang::Character::readChar(jchar ch)
{
// Both titlecase mapping tables have the same length. This is
// checked in the chartables script.
for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
{
if (title_to_lower_table[i][0] == ch)
return ch;
if (title_to_lower_table[i][1] == ch)
return title_to_lower_table[i][0];
if (title_to_upper_table[i][1] == ch)
return title_to_upper_table[i][0];
}
return toUpperCase (ch);
}
#ifdef COMPACT_CHARACTER
static int
table_search (const jchar table[][2], int table_len, jchar ch)
{
int low, high, i, old;
low = 0;
high = table_len;
i = high / 2;
while (true)
{
if (ch < table[i][0])
high = i;
else if (ch > table[i][1])
low = i;
else
return i;
old = i;
i = (high + low) / 2;
if (i == old)
break;
}
return -1;
}
jint
java::lang::Character::digit_value (jchar ch)
{
int index = table_search (digit_table, asize (digit_table), ch);
if (index == -1)
return -1;
jchar base = digit_table[index][0];
// Tamil doesn't have a digit `0'. So we special-case it here.
if (base == TAMIL_DIGIT_ONE)
return ch - base + 1;
return ch - base;
}
jint
java::lang::Character::getNumericValue (jchar ch)
{
jint d = digit (ch, 36);
if (d != -1)
return d;
for (unsigned int i = 0; i < asize (numeric_table); ++i)
{
if (numeric_table[i] == ch)
return numeric_value[i];
}
return -1;
// Perform 16-bit addition to find the correct entry in data.
return data[(jchar) (blocks[ch >> SHIFT] + ch)];
}
jint
java::lang::Character::getType (jchar ch)
java::lang::Character::getType(jchar ch)
{
int index = table_search (all_table, asize (all_table), ch);
if (index != -1)
return category_table[index];
return UNASSIGNED;
}
jboolean
java::lang::Character::isLowerCase (jchar ch)
{
if (ch >= 0x2000 && ch <= 0x2fff)
return false;
if (table_search (lower_case_table, asize (lower_case_table), ch) != -1)
return true;
int low, high, i, old;
low = 0;
high = asize (lower_anomalous_table);
i = high / 2;
while (true)
{
if (ch < lower_anomalous_table[i])
high = i;
else if (ch > lower_anomalous_table[i])
low = i;
else
return true;
old = i;
i = (high + low) / 2;
if (i == old)
break;
}
return false;
}
jboolean
java::lang::Character::isSpaceChar (jchar ch)
{
return table_search (space_table, asize (space_table), ch) != -1;
}
jboolean
java::lang::Character::isUpperCase (jchar ch)
{
if (ch >= 0x2000 && ch <= 0x2fff)
return false;
return table_search (upper_case_table, asize (upper_case_table), ch) != -1;
// Perform 16-bit addition to find the correct entry in data.
return (jint) (data[(jchar) (blocks[ch >> SHIFT] + ch)] & TYPE_MASK);
}
jchar
java::lang::Character::toLowerCase (jchar ch)
java::lang::Character::toLowerCase(jchar ch)
{
int index = table_search (upper_case_table, asize (upper_case_table), ch);
if (index == -1)
return to_lower_title (ch);
return (jchar) (ch - upper_case_table[index][0]
+ upper_case_map_table[index]);
return (jchar) (ch + lower[readChar(ch) >> 7]);
}
jchar
java::lang::Character::toUpperCase (jchar ch)
java::lang::Character::toUpperCase(jchar ch)
{
int index = table_search (lower_case_table, asize (lower_case_table), ch);
if (index == -1)
return to_upper_title (ch);
return (jchar) (ch - lower_case_table[index][0]
+ lower_case_map_table[index]);
return (jchar) (ch + upper[readChar(ch) >> 7]);
}
#else /* COMPACT_CHARACTER */
jint
java::lang::Character::digit_value (jchar ch)
jchar
java::lang::Character::toTitleCase(jchar ch)
{
if (type_table[ch] == DECIMAL_DIGIT_NUMBER)
return attribute_table[ch];
return -1;
// As title is short, it doesn't hurt to exhaustively iterate over it.
for (int i = title_length - 2; i >= 0; i -= 2)
if (title[i] == ch)
return title[i + 1];
return toUpperCase(ch);
}
jint
java::lang::Character::getNumericValue (jchar ch)
{
jint d = digit (ch, 36);
if (d != -1)
return d;
// Some characters require two attributes. We special-case them here.
if (ch >= ROMAN_START && ch <= ROMAN_END)
return secondary_attribute_table[ch - ROMAN_START];
if (type_table[ch] == LETTER_NUMBER || type_table[ch] == OTHER_NUMBER)
return attribute_table[ch];
return -1;
java::lang::Character::digit(jchar ch, jint radix)
{
if (radix < MIN_RADIX || radix > MAX_RADIX)
return (jint) -1;
jchar attr = readChar(ch);
if (((1 << (attr & TYPE_MASK))
& ((1 << UPPERCASE_LETTER)
| (1 << LOWERCASE_LETTER)
| (1 << DECIMAL_DIGIT_NUMBER))))
{
// Signedness doesn't matter; 0xffff vs. -1 are both rejected.
jint digit = (jint) numValue[attr >> 7];
return (digit >= 0 && digit < radix) ? digit : (jint) -1;
}
return (jint) -1;
}
jint
java::lang::Character::getType (jchar ch)
{
return type_table[ch];
}
jboolean
java::lang::Character::isLowerCase (jchar ch)
{
if (ch >= 0x2000 && ch <= 0x2fff)
return false;
return type_table[ch] == LOWERCASE_LETTER;
}
jboolean
java::lang::Character::isSpaceChar (jchar ch)
{
return (type_table[ch] == SPACE_SEPARATOR
|| type_table[ch] == LINE_SEPARATOR
|| type_table[ch] == PARAGRAPH_SEPARATOR);
}
jboolean
java::lang::Character::isUpperCase (jchar ch)
{
if (ch >= 0x2000 && ch <= 0x2fff)
return false;
return type_table[ch] == UPPERCASE_LETTER;
}
jchar
java::lang::Character::toLowerCase (jchar ch)
java::lang::Character::getNumericValue(jchar ch)
{
if (type_table[ch] == UPPERCASE_LETTER)
return attribute_table[ch];
return to_lower_title (ch);
// numValue is stored as an array of jshort, since 10000 is the maximum.
return (jint) numValue[readChar(ch) >> 7];
}
jchar
java::lang::Character::toUpperCase (jchar ch)
jbyte
java::lang::Character::getDirectionality(jchar ch)
{
if (type_table[ch] == LOWERCASE_LETTER)
return attribute_table[ch];
return to_upper_title (ch);
return direction[readChar(ch) >> 7];
}
#endif /* COMPACT_CHARACTER */
#! /usr/bin/perl
use strict;
use warnings;

# Generate java.lang.Character.UnicodeBlock constant declarations, followed by
# the array that lists them, from a Unicode Blocks.txt file.  The file to read
# is the first command-line argument; with no argument, Blocks.txt in the
# current directory is used, and wget is asked to fetch it if it is missing.
# All output goes to standard output.
my $file;
if (! defined $ARGV[0] || $ARGV[0] eq '')
  {
    $file = 'Blocks.txt';
    if (! -f $file)
      {
        # Too painful to figure out how to get Perl to do it.
        system 'wget -o .wget-log http://www.unicode.org/Public/UNIDATA/Blocks.txt';
      }
  }
else
  {
    $file = $ARGV[0];
  }

# Three-argument open: the file name is never parsed for a mode or pipe.
open (my $input, '<', $file) or die "couldn't open $file: $!";

my @array = ();
while (<$input>)
  {
    next if /^#/;
    # Remove only the line terminator.  chop would also eat a real character
    # when the final line of the file has no trailing newline.
    chomp;
    s/\r$//;
    next if /^$/;

    my ($start, $to, $text) = split (/; /);
    # Skip anything that is not a "start; end; name" record.
    next unless defined $text;

    # Derive the Java constant name: upper-case, with '-' and ' ' as '_'.
    (my $symbol = $text) =~ tr/a-z/A-Z/;
    $symbol =~ s/[- ]/_/g;

    # Special case for one of the SPECIALS.
    next if $start eq 'FEFF';

    # Special case some areas that our heuristic mishandles.
    if ($symbol eq 'HIGH_SURROGATES')
      {
        # The first surrogate block is emitted as one area ending at DFFF;
        # the remaining surrogate blocks are dropped by the branch below.
        $symbol = 'SURROGATES_AREA';
        $text = 'Surrogates Area';
        $to = 'DFFF';
      }
    elsif ($symbol =~ /SURROGATES/)
      {
        next;
      }
    elsif ($symbol eq 'PRIVATE_USE')
      {
        $symbol .= '_AREA';
        $text = 'Private Use Area';
      }

    printf " public static final UnicodeBlock %s = new UnicodeBlock (\"%s\", '\\u%s', '\\u%s');\n",
      $symbol, $text, $start, $to;
    push (@array, $symbol);
  }
close ($input);

print " private static final UnicodeBlock[] blocks = {\n";
foreach (@array)
  {
    print " $_";
    # SPECIALS is expected to be the final entry, so it gets no comma.
    print "," unless $_ eq 'SPECIALS';
    print "\n";
  }
print " };\n";
#!/usr/bin/perl -w
# unicode-blocks.pl -- Script to generate java.lang.Character.UnicodeBlock
# Copyright (C) 2002 Free Software Foundation, Inc.
#
# This file is part of GNU Classpath.
#
# GNU Classpath is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# GNU Classpath is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Classpath; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# 02111-1307 USA.
#
# Linking this library statically or dynamically with other modules is
# making a combined work based on this library. Thus, the terms and
# conditions of the GNU General Public License cover the whole
# combination.
#
# As a special exception, the copyright holders of this library give you
# permission to link this library with independent modules to produce an
# executable, regardless of the license terms of these independent
# modules, and to copy and distribute the resulting executable under
# terms of your choice, provided that you also meet, for each linked
# independent module, the terms and conditions of the license of that
# module. An independent module is a module which is not derived from
# or based on this library. If you modify this library, you may extend
# this exception to your version of the library, but you are not
# obligated to do so. If you do not wish to do so, delete this
# exception statement from your version.
# Code for reading Blocks.txt and generating (to standard out) the code for
# java.lang.Character.UnicodeBlock, for pasting into java/lang/Character.java.
# You should probably check that the results are accurate to the
# specification, but I made sure it works OOB for Unicode 3.0.0 and JDK 1.4.
# As the grammar for the Blocks.txt file is changing in Unicode 3.2.0, you
# will have to tweak this some for future use. For now, the relevant
# Unicode definition files are found in libjava/gnu/gcj/convert/.
#
# author Eric Blake <ebb9@email.byu.edu>
#
# usage: unicode-blocks.pl <blocks.txt>
# where <blocks.txt> is obtained from www.unicode.org (named Blocks-3.txt
# for Unicode version 3.0.0).
# Exactly one argument is required: the path of the Unicode blocks file.
die "Usage: $0 <blocks.txt>" unless @ARGV == 1;
# Three-argument open, so that characters such as '>' or '|' in the supplied
# path are taken literally instead of selecting an open mode or a pipe.
open (BLOCKS, '<', $ARGV[0]) || die "Can't open Unicode block file: $!\n";
# Constant names of blocks introduced after the class first appeared, mapped
# to the JDK release that added them.  The generator uses this to print
# @since tags automatically.  Keeping it up to date is optional (and
# tedious), but nice.
my %additions = map { $_ => "1.4" } qw(
    SYRIAC
    THAANA
    SINHALA
    MYANMAR
    ETHIOPIC
    CHEROKEE
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
    OGHAM
    RUNIC
    KHMER
    MONGOLIAN
    BRAILLE_PATTERNS
    CJK_RADICALS_SUPPLEMENT
    KANGXI_RADICALS
    IDEOGRAPHIC_DESCRIPTION_CHARACTERS
    BOPOMOFO_EXTENDED
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
    YI_SYLLABLES
    YI_RADICALS
);
# Emit the fixed preamble of the generated UnicodeBlock class.
#
# The text is printed in three pieces.  The large chunks stay in
# non-interpolating here-documents because they contain Javadoc tags such as
# @author and Java escapes such as \uFEFF, which must reach the output
# untouched.  Only the line naming the input file is printed separately, so
# that the file name given on the command line is substituted rather than the
# literal text "$ARGV[0]" appearing in the generated Javadoc.
print <<'EOF';
/**
* A family of character subsets in the Unicode specification. A character
* is in at most one of these blocks.
*
* This inner class was generated automatically from
EOF
print '* <code>' . $ARGV[0] . '</code>, by some perl scripts.' . "\n";
print <<'EOF';
* This Unicode definition file can be found on the
* <a href="http://www.unicode.org">http://www.unicode.org</a> website.
* JDK 1.4 uses Unicode version 3.0.0.
*
* @author scripts/unicode-blocks.pl (written by Eric Blake)
* @since 1.2
*/
public static final class UnicodeBlock extends Subset
{
/** The start of the subset. */
private final char start;
/** The end of the subset. */
private final char end;
/**
* Constructor for strictly defined blocks.
*
* @param start the start character of the range
* @param end the end character of the range
* @param name the block name
*/
private UnicodeBlock(char start, char end, String name)
{
super(name);
this.start = start;
this.end = end;
}
/**
* Returns the Unicode character block which a character belongs to.
*
* @param ch the character to look up
* @return the set it belongs to, or null if it is not in one
*/
public static UnicodeBlock of(char ch)
{
// Special case, since SPECIALS contains two ranges.
if (ch == '\uFEFF')
return SPECIALS;
// Simple binary search for the correct block.
int low = 0;
int hi = sets.length - 1;
while (low <= hi)
{
int mid = (low + hi) >> 1;
UnicodeBlock b = sets[mid];
if (ch < b.start)
hi = mid - 1;
else if (ch > b.end)
low = mid + 1;
else
return b;
}
return null;
}
EOF
# Walk the blocks file and print one UnicodeBlock constant per block,
# remembering each constant name in @names for the lookup array that is
# printed afterwards.
my $seenSpecials = 0;
my $seenSurrogates = 0;
my $surrogateStart = 0;
my @names = ();
while (<BLOCKS>) {
    next if /^\#/;
    my ($start, $end, $block) = split(/; /);
    # Blank or malformed lines have no third field.
    next unless defined $block;
    # Strip the line terminator and trailing blanks together.  Matching \s
    # also removes the carriage return of a CRLF-terminated file, which would
    # otherwise end up inside the generated constant name.
    $block =~ s/\s+$//;
    if (! $seenSpecials and $block =~ /Specials/) {
        # Special case SPECIALS, since it is two disjoint ranges: the first
        # range is skipped here and mentioned in the Javadoc of the second.
        $seenSpecials = 1;
        next;
    }
    if ($block =~ /Surrogates/) {
        # Special case SURROGATES_AREA, since it is one range in Java rather
        # than three consecutive ones.  The start of the first range is kept,
        # the second is skipped, and the third supplies the end.
        $seenSurrogates++;
        if ($seenSurrogates == 1) {
            $surrogateStart = $start;
            next;
        } elsif ($seenSurrogates == 2) {
            next;
        } else {
            $start = $surrogateStart;
            $block = "Surrogates Area";
        }
    }
    # Special case the name of PRIVATE_USE_AREA.
    $block =~ s/(Private Use)/$1 Area/;
    # Constant name: upper-case, with spaces and hyphens turned into '_'.
    (my $name = $block) =~ tr/a-z -/A-Z__/;
    push @names, $name;
    my $since = (defined $additions{$name}
                 ? "\n * \@since $additions{$name}" : "");
    my $extra = ($block =~ /Specials/ ? "'\\uFEFF', " : "");
    print <<EOF;
/**
* $block.
* $extra'\\u$start' - '\\u$end'.$since
*/
public final static UnicodeBlock $name
= new UnicodeBlock('\\u$start', '\\u$end',
"$name");
EOF
}
# Nothing else reads the input; release the handle.
close (BLOCKS);
# Finish the generated class: the array of every constant emitted so far, in
# file order, followed by the closing brace of UnicodeBlock.
print <<'EOF';
/**
* The defined subsets.
*/
private static final UnicodeBlock sets[] = {
EOF
print " $_,\n" for @names;
print <<'EOF';
};
} // class UnicodeBlock
EOF
#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
# details.
# Code for reading UnicodeData.txt and generating the character
# decomposition tables in include/java-chardecomp.h. For now, the relevant
# Unicode definition files are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
# where <UnicodeData.txt> is obtained from www.unicode.org (named
# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
# is the final location of include/java-chardecomp.h.
# As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.
# These map characters to their decompositions.  Keys are numeric code
# points; values are the expansion packed as 16-bit big-endian units.
my %canonical_decomposition = ();
my %full_decomposition = ();
# Handle `-n': drop the flag and force the output path to /dev/null so that
# all processing still happens but no file is created.
if ($ARGV[0] && $ARGV[0] eq '-n')
  {
    shift @ARGV;
    $ARGV[1] = '/dev/null';
  }
die "Usage: $0 <UnicodeData.txt> <java-chardecomp.h>" unless @ARGV == 2;
# Three-argument open keeps the user-supplied path from being parsed for a
# mode or pipe character.
open (UNICODE, '<', $ARGV[0]) || die "Can't open Unicode attribute file: $!\n";
# Process the Unicode file: record the decomposition mapping, if any, of
# every character, printing a progress dot to stderr every 1000 lines.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (<UNICODE>)
  {
    print STDERR "." unless $count++ % 1000;
    chomp;
    s/\r//g;
    # Field 0 is the code point and field 5 the decomposition mapping.
    my ($ch, undef, undef, undef, undef, $decomp) = split ';';
    # Blank or short lines have no sixth field; skip them instead of
    # comparing undef (which only produced warnings under -w).
    next unless defined $decomp && $decomp ne '';
    $ch = hex($ch);

    my $is_full = 0;
    my @decomp = ();
    foreach (split (' ', $decomp))
      {
        # A <tag> marks a compatibility mapping; those are stored in the
        # "full" table rather than the canonical one.
        if (/^\<.*\>$/)
          {
            $is_full = 1;
            next;
          }
        push (@decomp, hex ($_));
      }
    my $s = pack "n*", @decomp;
    if ($is_full)
      {
        $full_decomposition{$ch} = $s;
      }
    else
      {
        $canonical_decomposition{$ch} = $s;
      }
  }
# The input has been read completely; release the handle.
close (UNICODE);
# Now generate decomposition tables.
# Three-argument open: the output path is used verbatim.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-
#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__
// These tables are automatically generated by the $0
// script. DO NOT EDIT the tables. Instead, fix the script
// and run it again.
// This file should only be included by natCollator.cc
struct decomp_entry
{
jchar key;
const char *value;
};
EOF
# Called with & because the sub is defined, with a prototype, further down.
&write_decompositions;
print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
# Buffered write errors (for example a full disk) only surface at close, so
# a failure here must not be reported as success.
close (DECOMP) or die "Can't close output file: $!\n";
print STDERR "Done\n";
exit;
# Write a single decomposition table to DECOMP as a C array of decomp_entry
# initializers named NAME_decomposition.
#
# Arguments:
#   name     - prefix of the generated array name
#   is_canon - accepted from the callers but not consulted here
#   table    - code point => expansion, packed as 16-bit big-endian units
#
# Only code points 0 .. 0xffff are written, in ascending order.
sub write_single_decomposition($$%)
{
    my ($prefix, undef, %expansion_of) = @_;

    my @entries;
    foreach my $code (0 .. 0xffff)
      {
        next unless defined $expansion_of{$code};

        # Every 16-bit unit becomes two \x escapes, high byte first.  A
        # variable-length byte string is used because most expansions are
        # short but a few are very long (e.g. \uFDFA), so a fixed-width
        # representation would waste a lot of space.
        # NOTE(review): the earlier comment here described the string as
        # terminated with a double nul, but no explicit terminator is
        # written -- confirm what the reader in natCollator.cc expects.
        my $bytes = join ('',
                          map { sprintf ("\\x%02x\\x%02x",
                                         ($_ / 256), ($_ % 256)) }
                          unpack ("n*", $expansion_of{$code}));
        push (@entries, sprintf (" { 0x%04x, \"%s\" }", $code, $bytes));
      }

    print DECOMP "static const decomp_entry ${prefix}_decomposition[] =\n{\n";
    print DECOMP join (",\n", @entries);
    print DECOMP "\n};\n\n";
}
# Write both decomposition tables to DECOMP: the canonical mappings first,
# then the "full" ones (those whose UnicodeData entry carried a <tag>).
sub write_decompositions()
{
    write_single_decomposition ('canonical', 1, %canonical_decomposition);
    write_single_decomposition ('full', 0, %full_decomposition);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment