#! /usr/local/bin/perl -w # $Id: generate_normalize_data.pl,v 1.1.1.1 2003-06-04 00:27:55 marka Exp $ # # Copyright (c) 2000,2001 Japan Network Information Center. # All rights reserved. # # By using this file, you agree to the terms and conditions set forth bellow. # # LICENSE TERMS AND CONDITIONS # # The following License Terms and Conditions apply, unless a different # license is obtained from Japan Network Information Center ("JPNIC"), # a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, # Chiyoda-ku, Tokyo 101-0047, Japan. # # 1. Use, Modification and Redistribution (including distribution of any # modified or derived work) in source and/or binary forms is permitted # under this License Terms and Conditions. # # 2. Redistribution of source code must retain the copyright notices as they # appear in each source code file, this License Terms and Conditions. # # 3. Redistribution in binary form must reproduce the Copyright Notice, # this License Terms and Conditions, in the documentation and/or other # materials provided with the distribution. For the purposes of binary # distribution the "Copyright Notice" refers to the following language: # "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." # # 4. The name of JPNIC may not be used to endorse or promote products # derived from this Software without specific prior written approval of # JPNIC. # # 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF # ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. # # # Generate lib/unicodedata.c from UnicodeData.txt, # CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt, # all of them available from ftp://ftp.unicode.org/Public/UNIDATA/. # use strict; use lib qw(.); use Getopt::Long; use UCD; use SparseMap; use constant UCS_MAX => 0x110000; use constant END_BIT => 0x80000000; my $DECOMP_COMPAT_BIT = 0x8000; my $CASEMAP_FINAL_BIT = 0x1; my $CASEMAP_NONFINAL_BIT = 0x2; my $CASEMAP_LAST_BIT = 0x10; my $LETTER_BIT = 1; my $NSPMARK_BIT = 2; (my $myid = '$Id: generate_normalize_data.pl,v 1.1.1.1 2003-06-04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/; my @default_bits = (9, 7, 5); #my @default_bits = (7, 7, 7); my @canon_class_bits = @default_bits; my @decomp_bits = @default_bits; my @comp_bits = @default_bits; my @folding_bits = @default_bits; my @casemap_bits = @default_bits; my @casemap_ctx_bits = @default_bits; my $prefix = ''; my $dir = '.'; my $unicodedatafile = 'UnicodeData.txt'; my $exclusionfile = 'CompositionExclusions.txt'; my $specialcasefile = 'SpecialCasing.txt'; my $casefoldingfile = 'CaseFolding.txt'; my $verbose; GetOptions('dir|d=s' => \$dir, 'unicodedata|u=s' => \$unicodedatafile, 'exclude|e=s' => \$exclusionfile, 'specialcase|s=s' => \$specialcasefile, 'casefold|c=s' => \$casefoldingfile, 'prefix|p=s' => \$prefix, 'verbose|v' => \$verbose, ) or usage(); foreach my $r (\$unicodedatafile, \$exclusionfile, \$specialcasefile, \$casefoldingfile) { $$r = "$dir/$$r" unless $$r =~ m|^/|; } my %exclusions; my %lower_special; my %upper_special; my @decomp_data; my @comp_data; my @toupper_data; my @tolower_data; my @folding_data; # # Create Mapping/Bitmap objects. # # canonical class my $canon_class = SparseMap::Int->new(BITS => [@canon_class_bits], MAX => UCS_MAX, MAPALL => 1, DEFAULT => 0); # canonical/compatibility decomposition my $decomp = SparseMap::Int->new(BITS => [@decomp_bits], MAX => UCS_MAX, MAPALL => 1, DEFAULT => 0); # canonical composition my $comp = SparseMap::Int->new(BITS => [@comp_bits], MAX => UCS_MAX, MAPALL => 1, DEFAULT => 0); # uppercase/lowercase my $upper = SparseMap::Int->new(BITS => [@casemap_bits], MAX => UCS_MAX, MAPALL => 1, DEFAULT => 0); my $lower = SparseMap::Int->new(BITS => [@casemap_bits], MAX => UCS_MAX, MAPALL => 1, DEFAULT => 0); # final/nonfinal context my $casemap_ctx = SparseMap::Int->new(BITS => [@casemap_ctx_bits], MAX => UCS_MAX, MAPALL => 1, DEFAULT => 0); # casefolding my $folding = SparseMap::Int->new(BITS => [@folding_bits], MAX => UCS_MAX, MAPALL => 1, DEFAULT => 0); # # Read datafiles. # read_exclusion_file(); read_specialcasing_file(); read_unicodedata_file(); read_casefolding_file(); print_header(); print_canon_class(); print_composition(); print_decomposition(); print_casemap(); print_casemap_context(); print_casefolding(); exit; sub usage { print STDERR <<"END"; Usage: $0 [options..] options: -d DIR directory where Unicode Character Data files resides [./] -u FILE name of the UnicodeData file [UnicodeData.txt] -e FILE name of the CompositionExclusion file [CompositionExclusions-1.txt] -s FILE name of the SpecialCasing file [SpecialCasing.txt] -c FILE name of the CaseFolding file [CaseFolding.txt] END exit 1; } # # read_exclusion_file -- read CompositionExclusions-1.txt. # sub read_exclusion_file { open EXCLUDE, $exclusionfile or die "cannot open $exclusionfile: $!\n"; while ($_ = UCD::CompositionExclusions::getline(\*EXCLUDE)) { my %data = UCD::CompositionExclusions::parseline($_); $exclusions{$data{CODE}} = 1; } close EXCLUDE; } # # read_specialcasing_file -- read SpecialCasing.txt # sub read_specialcasing_file { open SPCASE, $specialcasefile or die "cannot open $specialcasefile: $!\n"; while ($_ = UCD::SpecialCasing::getline(\*SPCASE)) { my %data = UCD::SpecialCasing::parseline($_); my $code = $data{CODE}; my $lower = $data{LOWER}; my $upper = $data{UPPER}; my $cond = $data{CONDITION} || ''; next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/; if (defined $cond && (@$lower > 1 || $lower->[0] != $code) or @$lower > 1 or $lower->[0] != $code) { $lower_special{$code} = [$lower, $cond]; } if (defined $cond && (@$upper > 1 || $upper->[0] != $code) or @$upper > 1 or $upper->[0] != $code) { $upper_special{$code} = [$upper, $cond]; } } close SPCASE; } # # read_unicodedata_file -- read UnicodeData.txt # sub read_unicodedata_file { open UCD, $unicodedatafile or die "cannot open $unicodedatafile: $!\n"; @decomp_data = (0); @toupper_data = (0); @tolower_data = (0); my @comp_cand; # canonical composition candidates my %nonstarter; while ($_ = UCD::UnicodeData::getline(\*UCD)) { my %data = UCD::UnicodeData::parseline($_); my $code = $data{CODE}; # combining class if ($data{CLASS} > 0) { $nonstarter{$code} = 1; $canon_class->add($code, $data{CLASS}); } # uppercasing if (exists $upper_special{$code} or defined $data{UPPER}) { my $offset = @toupper_data; my @casedata; $upper->add($code, $offset); if (exists $upper_special{$code}) { push @casedata, $upper_special{$code}; } if (defined $data{UPPER}) { push @casedata, $data{UPPER}; } push @toupper_data, casemap_data(@casedata); } # lowercasing if (exists $lower_special{$code} or defined $data{LOWER}) { my $offset = @tolower_data; my @casedata; $lower->add($code, $offset); if (exists $lower_special{$code}) { push @casedata, $lower_special{$code}; } if (defined $data{LOWER}) { push @casedata, $data{LOWER}; } push @tolower_data, casemap_data(@casedata); } # composition/decomposition if ($data{DECOMP}) { my ($tag, @decomp) = @{$data{DECOMP}}; my $offset = @decomp_data; # composition if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) { # canonical composition candidate push @comp_cand, [$code, @decomp]; } # decomposition if ($tag ne '') { # compatibility decomposition $offset |= $DECOMP_COMPAT_BIT; } $decomp->add($code, $offset); push @decomp_data, @decomp; $decomp_data[-1] |= END_BIT; } # final/nonfinal context if ($data{CATEGORY} =~ /L[ult]/) { $casemap_ctx->add($code, $LETTER_BIT); } elsif ($data{CATEGORY} eq 'Mn') { $casemap_ctx->add($code, $NSPMARK_BIT); } } close UCD; # Eliminate composition candidates whose decomposition starts with # a non-starter. @comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand; @comp_data = ([0, 0, 0]); my $last_code = -1; my $last_offset = @comp_data; for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) { if ($r->[1] != $last_code) { $comp->add($last_code, ($last_offset | ((@comp_data - $last_offset)<<16))) unless $last_code == -1; $last_code = $r->[1]; $last_offset = @comp_data; } push @comp_data, $r; } $comp->add($last_code, ($last_offset | ((@comp_data - $last_offset)<<16))); } sub casemap_data { my @data = @_; my @result = (); while (@data > 0) { my $r = shift @data; my $flag = 0; if (ref $r) { if ($r->[1] eq 'FINAL') { $flag |= $CASEMAP_FINAL_BIT; } elsif ($r->[1] eq 'NON_FINAL') { $flag |= $CASEMAP_NONFINAL_BIT; } elsif ($r->[1] ne '') { die "unknown condition \"", $r->[1], "\"\n"; } } $flag |= $CASEMAP_LAST_BIT if @data == 0; push @result, $flag; push @result, (ref $r) ? @{$r->[0]} : $r; $result[-1] |= END_BIT; } @result; } # # read_casefolding_file -- read CaseFolding.txt # sub read_casefolding_file { open FOLD, $casefoldingfile or die "cannto open $casefoldingfile: $!\n"; # dummy. @folding_data = (0); while ($_ = UCD::CaseFolding::getline(\*FOLD)) { my %data = UCD::CaseFolding::parseline($_); $folding->add($data{CODE}, scalar(@folding_data)); push @folding_data, @{$data{MAP}}; $folding_data[-1] |= END_BIT; } close FOLD; } sub print_header { print <<"END"; /* \$Id\$ */ /* $myid */ /* * Do not edit this file! * This file is generated from UnicodeData.txt, CompositionExclusions-1.txt, * SpecialCasing.txt and CaseFolding.txt. */ END } # # print_canon_class -- generate data for canonical class # sub print_canon_class { $canon_class->fix(); print STDERR "** cannon_class\n", $canon_class->stat() if $verbose; print <<"END"; /* * Canonical Class */ END print_bits("CANON_CLASS", @canon_class_bits); print "\n"; print $canon_class->cprog(NAME => "${prefix}canon_class"); } # # print_composition -- generate data for canonical composition # sub print_composition { $comp->fix(); print STDERR "** composition\n", $comp->stat() if $verbose; print <<"END"; /* * Canonical Composition */ END print_bits("CANON_COMPOSE", @comp_bits); print "\n"; print $comp->cprog(NAME => "${prefix}compose"); print <<"END"; static const struct composition ${prefix}compose_seq[] = { END my $i = 0; foreach my $r (@comp_data) { if ($i % 2 == 0) { print "\n" if $i != 0; print "\t"; } printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0]; $i++; } print "\n};\n\n"; } # # print_decomposition -- generate data for canonical/compatibility # decomposition # sub print_decomposition { $decomp->fix(); print STDERR "** decomposition\n", $decomp->stat() if $verbose; print <<"END"; /* * Canonical/Compatibility Decomposition */ END print_bits("DECOMP", @decomp_bits); print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n"; print $decomp->cprog(NAME => "${prefix}decompose"); print "static const unsigned long ${prefix}decompose_seq[] = {\n"; print_ulseq(@decomp_data); print "};\n\n"; } # # print_casemap -- generate data for case mapping # sub print_casemap { $upper->fix(); $lower->fix(); print STDERR "** upper mapping\n", $upper->stat() if $verbose; print STDERR "** lower mapping\n", $lower->stat() if $verbose; print <<"END"; /* * Lowercase <-> Uppercase mapping */ /* * Flags for special case mapping. */ #define CMF_FINAL $CASEMAP_FINAL_BIT #define CMF_NONFINAL $CASEMAP_NONFINAL_BIT #define CMF_LAST $CASEMAP_LAST_BIT #define CMF_CTXDEP (CMF_FINAL|CMF_NONFINAL) END print_bits("CASEMAP", @casemap_bits); print "\n"; print $upper->cprog(NAME => "${prefix}toupper"); print $lower->cprog(NAME => "${prefix}tolower"); print "static const unsigned long ${prefix}toupper_seq[] = {\n"; print_ulseq(@toupper_data); print "};\n\n"; print "static const unsigned long ${prefix}tolower_seq[] = {\n"; print_ulseq(@tolower_data); print "};\n\n"; } # # print_casefolding -- generate data for case folding # sub print_casefolding { $folding->fix(); print STDERR "** case folding\n", $folding->stat() if $verbose; print <<"END"; /* * Case Folding */ END print_bits("CASE_FOLDING", @folding_bits); print "\n"; print $folding->cprog(NAME => "${prefix}case_folding"); print "static const unsigned long ${prefix}case_folding_seq[] = {\n"; print_ulseq(@folding_data); print "};\n\n"; } # # print_casemap_context -- gerarate data for determining context # (final/non-final) # sub print_casemap_context { $casemap_ctx->fix(); print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose; print <<"END"; /* * Cased characters and non-spacing marks (for casemap context) */ END print_bits("CASEMAP_CTX", @casemap_ctx_bits); print <<"END"; #define CTX_CASED $LETTER_BIT #define CTX_NSM $NSPMARK_BIT END print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx"); } sub sprint_composition_hash { my $i = 0; my $s = ''; foreach my $r (@_) { if ($i % 2 == 0) { $s .= "\n" if $i != 0; $s .= "\t"; } $s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r}; $i++; } $s; } sub print_bits { my $prefix = shift; my $i = 0; foreach my $bit (@_) { print "#define ${prefix}_BITS_$i\t$bit\n"; $i++; } } sub print_ulseq { my $i = 0; foreach my $v (@_) { if ($i % 4 == 0) { print "\n" if $i != 0; print "\t"; } printf "0x%08x, ", $v; $i++; } print "\n"; }