unicode-decomp.pl 3.86 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
# 
# This software is copyrighted work licensed under the terms of the
# Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
# details.

# Code for reading UnicodeData.txt and generating the character
# decomposition tables (java-chardecomp.h) that are included by
# natCollator.cc.  For now, the relevant Unicode definition files
# are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
#   is the final location of include/java-chardecomp.h.
#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.

use strict;

# These map character codes (as numbers) to their decompositions.  Each
# value is the expansion packed as a string of 16-bit big-endian units.
# Entries whose decomposition field carries a <tag> go into
# %full_decomposition; untagged ones go into %canonical_decomposition.
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n': drop the option and redirect the output to /dev/null, so
# that all processing still occurs but no file is created.
if (defined $ARGV[0] && $ARGV[0] eq '-n')
{
    shift @ARGV;
    $ARGV[1] = '/dev/null';
}
# After a lone `-n', @ARGV is (undef, '/dev/null'): the element count is
# right but the input file is missing, so test that it is defined too.
die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>"
    unless @ARGV == 2 && defined $ARGV[0];

# Three-argument open keeps characters in the file name (a leading `>',
# a trailing `|', surrounding blanks) from being read as part of the mode.
open (my $unicode, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <$unicode>)
{
    # Progress indicator: one dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Field 0 is the character code in hex; field 5 is the decomposition.
    my ($ch, undef, undef, undef, undef, $decomp) = split (/;/, $line);

    # Blank or truncated lines yield no decomposition field at all;
    # skip them before anything touches the undefined values.
    next unless defined $decomp && $decomp ne '';
    $ch = hex ($ch);

    my $is_full = 0;
    my @units = ();
    foreach my $field (split (' ', $decomp))
    {
        # A <tag> marks the mapping as non-canonical; the tag itself is
        # not part of the expansion.
        if ($field =~ /^\<.*\>$/)
        {
            $is_full = 1;
            next;
        }
        push (@units, hex ($field));
    }

    # NOTE(review): the "n" template stores 16-bit values, so an
    # expansion containing a code point above U+FFFF would not survive
    # this step intact.  Presumably UnicodeData-3.0.0.txt has none --
    # confirm before feeding newer data.
    my $packed = pack ("n*", @units);
    if ($is_full)
    {
        $full_decomposition{$ch} = $packed;
    }
    else
    {
        $canonical_decomposition{$ch} = $packed;
    }
}
close ($unicode);

# Now generate decomposition tables.
# Three-argument open: the output path is taken literally, never as part
# of the open mode.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__


// These tables are automatically generated by the $0
// script.  DO NOT EDIT the tables.  Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
  jchar key;
  const char *value;
};

EOF

# The sub is defined further down with a prototype, so it is called with
# a leading `&' (which skips prototype checking) and with explicit
# parentheses (so that the current @_ is not passed along implicitly).
&write_decompositions ();

print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered output is only flushed here, so a failed write (disk full,
# I/O error) shows up as a failed close.  Die so that the exit status is
# nonzero, as the header comment of this script promises.
close (DECOMP) or die "Can't write output file: $!\n";
print STDERR "Done\n";
exit;


# Write a single decomposition table to DECOMP as a C array of
# decomp_entry initializers.
#   $name     - prefix of the array name; the array is declared as
#               "${name}_decomposition".
#   $is_canon - accepted from the callers but not consulted here.
#   %table    - maps a character code to its expansion, packed as
#               16-bit big-endian units (the "n*" template).
# Only keys in 0 .. 0xffff whose value is defined are written, in
# ascending key order, separated by ",\n".
sub write_single_decomposition($$%)
{
    my ($name, $is_canon, %table) = @_;
    my @entries = ();

    foreach my $code (0 .. 0xffff)
    {
        my $packed = $table{$code};
        next unless defined $packed;

        # The expansion is written as a series of bytes, two per 16-bit
        # unit with the high byte first.  This is ugly, but relatively
        # space-efficient: most expansions are short while a few are
        # very long (e.g. \uFDFA), so a fixed-width representation would
        # waste a lot of space.
        # NOTE(review): no explicit terminator is emitted; the string
        # ends with whatever the C compiler appends to a literal.
        # Confirm this against the code that reads the table.
        my $bytes = join ('',
                          map { sprintf ("\\x%02x\\x%02x", $_ >> 8, $_ & 0xff) }
                          unpack ("n*", $packed));

        push (@entries, sprintf ("  { 0x%04x, \"%s\" }", $code, $bytes));
    }

    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n"
                 . join (",\n", @entries)
                 . "\n};\n\n";
}

# Write both decomposition tables to DECOMP: first the canonical one,
# then the full one.  Takes no arguments.
sub write_decompositions()
{
    # One row per table: array-name prefix, is-canonical flag, and the
    # map holding that table's data.
    my @tables = (
        [ 'canonical', 1, \%canonical_decomposition ],
        [ 'full',      0, \%full_decomposition ],
    );

    foreach my $spec (@tables)
    {
        my ($label, $canon_flag, $map) = @$spec;
        write_single_decomposition ($label, $canon_flag, %$map);
    }
}