From: Sergio Gómez Del Real Subject: [PATCH v2 1/3] tools/make_unicode: Implement full Unicode character decomposition. Message-Id: <20180411142102.4677-1-sdelreal@codeweavers.com> Date: Wed, 11 Apr 2018 09:21:00 -0500 Signed-off-by: Sergio Gómez Del Real --- libs/port/mbtowc.c | 14 +- tools/make_unicode | 423 +++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 370 insertions(+), 67 deletions(-) diff --git a/libs/port/mbtowc.c b/libs/port/mbtowc.c index 4977c82d8b..6976f89722 100644 --- a/libs/port/mbtowc.c +++ b/libs/port/mbtowc.c @@ -22,7 +22,7 @@ #include "wine/unicode.h" -extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ) DECLSPEC_HIDDEN; +extern int wine_unicode_decompose_string( int compat, const WCHAR *src, int srclen, WCHAR *dst, int dstlen ); /* check the code whether it is in Unicode Private Use Area (PUA). */ /* MB_ERR_INVALID_CHARS raises an error converting from 1-byte character to PUA. */ @@ -101,19 +101,19 @@ static int mbstowcs_sbcs_decompose( const struct sbcs_table *table, int flags, WCHAR *dst, unsigned int dstlen ) { const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni; - unsigned int len; + int len; if (!dstlen) /* compute length */ { WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ for (len = 0; srclen; srclen--, src++) - len += wine_decompose( cp2uni[*src], dummy, 4 ); + len += wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dummy, 4 ); return len; } for (len = dstlen; srclen && len; srclen--, src++) { - unsigned int res = wine_decompose( cp2uni[*src], dst, len ); + int res = wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dst, len ); if (!res) break; len -= res; dst += res; @@ -203,7 +203,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table, { const WCHAR * const cp2uni = table->cp2uni; const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes; - unsigned int len, res; + int len, res; WCHAR ch; if (!dstlen) /* compute length */ @@ -219,7 +219,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table, ch = cp2uni[(off << 8) + *src]; } else ch = cp2uni[*src]; - len += wine_decompose( ch, dummy, 4 ); + len += wine_unicode_decompose_string( 0, &ch, 1, dummy, 4 ); } return len; } @@ -234,7 +234,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table, ch = cp2uni[(off << 8) + *src]; } else ch = cp2uni[*src]; - if (!(res = wine_decompose( ch, dst, len ))) break; + if (!(res = wine_unicode_decompose_string( 0, &ch, 1, dst, len ))) break; dst += res; len -= res; } diff --git a/tools/make_unicode b/tools/make_unicode index 92b0b64a94..4ff0b13320 100755 --- a/tools/make_unicode +++ b/tools/make_unicode @@ -471,6 +471,26 @@ sub READ_DEFAULTS($) next if $decomp eq ""; # no decomposition, skip it + # store decomposition table + if ($decomp =~ /^<([a-zA-Z]+)>(\s+[0-9a-fA-F]+)+$/) + { + my @seq = (); + for my $ch (split /\s+/, (split /\s+/, $decomp, 2)[1]) + { + push @seq, (hex $ch); + } + $decomp_table[$src] = [1, \@seq]; + } + elsif ($decomp =~ /^([0-9a-fA-F]+)(\s+([0-9a-fA-F]+))*$/) + { + my @seq = (); + for my $ch (split /\s+/, $decomp) + { + push @seq, (hex $ch); + } + $decomp_table[$src] = [0, \@seq]; + } + if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/) { # decomposition of the form " 1234" -> use char if type is known @@ -508,7 +528,6 @@ sub READ_DEFAULTS($) # store decomposition if it contains two chars if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/) { - $decomp_table[$src] = [ hex $1, hex $2 ]; push @compose_table, [ hex $1, hex $2, $src ]; } elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ && @@ -2258,6 +2277,51 @@ EOF save_file($filename); } +sub do_decomp +{ + my ($char, $table_ref, $compat) = @_; + + return ($char) unless defined $table_ref->[$char]; + my $data = $table_ref->[$char]; + return ($char) if $data->[0] && !$compat; + my @mapping = (); + for my $ch (@{$data->[1]}) + { + push @mapping, $ch; + } + return @mapping; +} + +sub expand_pairs +{ + my @data = @_; + my @result = (); + + for my $ch (@data) + { + if ($ch <= 0xFFFF) + { + push @result, $ch; + } + elsif ($ch >= 2097152) # 2**21 + { + die sprintf "Invalid Unicode character %04x\n", $ch; + } + else + { + my $hx = $ch & 0xFFFF; + my $hu = ($ch >> 16) & ((1 << 5) - 1); + my $hw = ($hu - 1) & 0xFFFF; + my $hi = 0xD800 | ($hw << 6) | ($hx >> 10); + my $lx = $ch & 0xFFFF; + my $lo = (0xDC00 | ($lx & ((1 << 10) - 1))) & 0xFFFF; + push @result, $hi; + push @result, $lo; + } + } + return @result; +} + ################################################################ # dump the char decomposition table sub dump_decompose_table($) @@ -2266,98 +2330,337 @@ sub dump_decompose_table($) open OUTPUT,">$filename.new" or die "Cannot create $filename"; print "Building $filename\n"; - print OUTPUT "/* Unicode char composition */\n"; + print OUTPUT "/* Unicode char decomposition */\n"; print OUTPUT "/* generated from $UNIDATA/UnicodeData.txt */\n"; print OUTPUT "/* DO NOT EDIT!! */\n\n"; print OUTPUT "#include \"wine/unicode.h\"\n\n"; - # first determine all the 16-char subsets that contain something + my $utflim = 2097152; + my %nfd_lookup = (); + my %nfkd_lookup = (); + my %decomp_lookup = (); + my @decomp_data = (0); + my $pos = 1; + my $lastchar_decomp; - my @filled = (0) x 4096; - my $pos = 16*2; # for the null subset - for (my $i = 0; $i < 65536; $i++) + for (my $i = 0; $i < $utflim; $i++) { next unless defined $decomp_table[$i]; - $filled[$i >> 4] = $pos; - $pos += 16*2; - $i |= 15; + + if (defined $decomp_table[$i]) + { + $lastchar_decomp = $i; + # fully expand input and mappings + + my @char = expand_pairs( ($i) ); + push @char, 0; + my $char = pack "n*", @char; + + my @nfd = do_decomp( $i, \@decomp_table, 0 ); + @nfd = expand_pairs( @nfd ); + push @nfd, 0; + my $nfd = pack "n*", @nfd; + + my @nfkd = do_decomp( $i, \@decomp_table, 1 ); + @nfkd = expand_pairs( @nfkd ); + push @nfkd, 0; + my $nfkd = pack "n*", @nfkd; + + # lookup or add mappings + + if ($nfd eq $char) + { + $nfd = undef; + } + elsif (exists $decomp_lookup{$nfd}) + { + $nfd_lookup{$i} = $decomp_lookup{$nfd}; + } + else + { + push @decomp_data, @nfd; + $decomp_lookup{$nfd} = $pos; + $nfd_lookup{$i} = $pos; + $pos += @nfd; + } + + if ($nfkd eq $char) + { + $nfkd = undef; + } + elsif (exists $decomp_lookup{$nfkd}) + { + $nfkd_lookup{$i} = $decomp_lookup{$nfkd}; + } + else + { + push @decomp_data, @nfkd; + $decomp_lookup{$nfkd} = $pos; + $nfkd_lookup{$i} = $pos; + $pos += @nfkd; + } + } } - my $total = $pos; - # now count the 256-char subsets that contain something + printf OUTPUT "static const UINT last_decomposable = 0x%x;\n\n", $lastchar_decomp; + + # dump decomposition data + + printf OUTPUT "static const WCHAR data_decomp[%d] =\n", $pos; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @decomp_data ); + print OUTPUT "\n};\n\n"; - my @filled_idx = (256) x 256; - $pos = 256 + 16; - for (my $i = 0; $i < 4096; $i++) + # find 256-char subsets that contain something + + my $filled_pos = 1; + my $filled_lim = ($lastchar_decomp >> 8) + 1; + my @filled = (0) x $filled_lim; + for (my $i = 0; $i < $utflim; $i++) { - next unless $filled[$i]; - $filled_idx[$i >> 4] = $pos; - $pos += 16; - $i |= 15; + last if $i > $lastchar_decomp; + next unless exists $nfd_lookup{$i} || exists $nfkd_lookup{$i}; + $filled[$i >> 8] = $filled_pos++; + $i |= 255; } - my $null_offset = $pos; # null mapping - $total += $pos; - # add the index offsets to the subsets positions + # dump index of 256-char subsets + + printf OUTPUT "static const BYTE idx1_decomp[%d] =\n", $filled_lim; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @filled ); + print OUTPUT "\n};\n\n"; + + # for 256-char subsets, find non-empty 16-char subsets - for (my $i = 0; $i < 4096; $i++) + my $sub_filled_pos = 1; + my %sub_filled = (); + for (my $i = 0; $i < $filled_lim; $i++) { next unless $filled[$i]; - $filled[$i] += $null_offset; + for (my $j = 0; $j < 256; $j++) + { + my $idx = ($i << 8) | $j; + next unless exists $nfd_lookup{$idx} || exists $nfkd_lookup{$idx}; + $sub_filled{$idx >> 4} = $sub_filled_pos++; + $j |= 15; + } } - # dump the main index + # dump index of 16-char subsets - printf OUTPUT "static const WCHAR table[%d] =\n", $total; - printf OUTPUT "{\n /* index */\n"; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx ); - printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 ); - - # dump the second-level indexes - - for (my $i = 0; $i < 256; $i++) + printf OUTPUT "static const USHORT idx2_decomp[%d] =\n", $filled_pos * 16; + print OUTPUT "{\n"; + my @null_idx = (0) x 16; + print OUTPUT " /* null sub-index */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx ); + for (my $i = 0; $i < $filled_lim; $i++) { - next unless ($filled_idx[$i] > 256); - my @table = @filled[($i<<4)..($i<<4)+15]; - for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; } - printf OUTPUT ",\n /* sub-index %02x */\n", $i; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table ); + next unless $filled[$i]; + printf OUTPUT ",\n /* sub-index 0x%02x */\n", $filled[$i]; + + my @sub_idx; + for (my $j = 0; $j < 16; $j++) + { + my $idx = ($i << 4) | $j; + $sub_idx[$j] = $sub_filled{$idx} || 0; + } + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx ); } + print OUTPUT "\n};\n\n"; # dump the 16-char subsets - printf OUTPUT ",\n /* null mapping */\n"; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 ); - - for (my $i = 0; $i < 4096; $i++) + printf OUTPUT "static const USHORT offsets_decomp[%d] =\n", 32 * $sub_filled_pos; + print OUTPUT "{\n"; + print OUTPUT " /* (nfd, nfkd) x 16 */\n"; + my @null_table = (0) x 32; + print OUTPUT " /* no decomposition */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table ); + for my $key (sort {$a <=> $b} keys %sub_filled) { - next unless $filled[$i]; - my @table = (0) x 32; + printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $key, $key; + my @sub_table; for (my $j = 0; $j < 16; $j++) { - if (defined $decomp_table[($i<<4) + $j]) - { - $table[2 * $j] = ${$decomp_table[($i << 4) + $j]}[0]; - $table[2 * $j + 1] = ${$decomp_table[($i << 4) + $j]}[1]; - } + my $idx = ($key << 4) | $j; + $sub_table[2 * $j] = $nfd_lookup{$idx} || 0; + $sub_table[2 * $j + 1] = $nfkd_lookup{$idx} || 0; } - printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $i, $i; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table ); + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table ); } + print OUTPUT "\n};\n\n"; - printf OUTPUT "\n};\n\n"; print OUTPUT <<"EOF"; -unsigned int DECLSPEC_HIDDEN wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ) +static const WCHAR *unicode_table_lookup( UINT cp, int compat, const BYTE *idx1, UINT scale_idx1, + const USHORT *idx2, UINT scale_idx2, const USHORT *offsets, + UINT scale_off, const WCHAR *data, UINT scale_data ) { - const WCHAR *ptr = table + table[table[ch >> 8] + ((ch >> 4) & 0x0f)] + 2 * (ch & 0xf); - unsigned int res; + USHORT a, b, c, d; - *dst = ch; - if (!*ptr) return 1; - if (dstlen <= 1) return 0; - /* apply the decomposition recursively to the first char */ - if ((res = wine_decompose( *ptr, dst, dstlen-1 ))) dst[res++] = ptr[1]; - return res; + a = idx1[cp >> scale_idx1]; + b = idx2[(a << scale_idx2) + ((cp >> scale_idx2) & 0xf)]; + c = (b << scale_off) + ((cp & 0xf) << scale_data); + if (compat) ++c; + d = offsets[c]; + + return &data[d]; +} + +static inline int decompose_hangul( WCHAR ch, WCHAR dum[4], int dstlen ) +{ + static const WCHAR sbase = 0xac00, lbase = 0x1100, vbase = 0x1161, tbase = 0x11a7; + static const WCHAR /*lcount = 19, vcount = 21,*/ tcount = 28, ncount = 588/*, scount = 11172*/; + WCHAR sindex, lindex, vindex, tindex; + + if (ch >= 0xac00 && ch <= 0xd7af) + { + sindex = ch - sbase; + lindex = sindex / ncount; + vindex = (sindex % ncount) / tcount; + tindex = sindex % tcount; + dum[0] = lbase + lindex; + dum[1] = vbase + vindex; + dum[2] = (tindex > 0) ? (tbase + tindex) : 0; + dum[3] = 0; + } + else + { + dum[0] = ch; + dum[1] = 0; + } + + return 0; +} + +static inline UINT utf16_codepoint_to_surrogates( UINT cp ) +{ + UINT ch = cp; + WCHAR hx, hw, lx; + UINT hu; + + hx = (WCHAR)cp; + hu = (cp >> 16) & ((1 << 5) - 1); + hw = (WCHAR)hu - 1; + lx = (WCHAR)cp; + ch = 0xD800 | (hw << 6) | (hx >> 10); + ch |= (0xDC00 | (lx & ((1 << 10) - 1)))<<16; + + return ch; +} + +static inline UINT utf16_surrogates_to_codepoint( WCHAR hi, WCHAR lo ) +{ + UINT x, w, u, c; + + x = ((hi & ((1 << 6) - 1)) << 10) | (lo & ((1 << 10) - 1)); + w = (hi >> 6) & ((1 << 5) - 1); + u = w + 1; + c = (u << 16) | x; + + return c; +} + +static int decompose_char_recursive( int compat, UINT ch, WCHAR *dst, int dstlen ) +{ + int total_decomp = 0; + int size_decomp; + const WCHAR *map; + + if (ch < 0xa0) /* fast path */ + { + if (dstlen) *dst = (WCHAR)ch; + return 1; + } + else if (ch >= 0xac00 && ch <= 0xd7af) /* hangul */ + { + WCHAR dum[4]; + int len = 0, i = 0; + + decompose_hangul( ch, dum, dstlen ); + while (dum[i]) + { + if (dstlen-i) dst[i] = dum[i]; + i++; + len++; + } + return len; + } + else if (ch > last_decomposable || + !*(map = unicode_table_lookup( ch, compat, idx1_decomp, 8, + idx2_decomp, 4, offsets_decomp, 5, data_decomp, 1 ))) + { + if (ch > 0xffff) + { + ch = utf16_codepoint_to_surrogates( ch ); + if (dstlen) *dst = (WCHAR)ch; + if (dstlen-1) *(dst+1) = (WCHAR)(ch>>16); + return 2; + } + else + { + if (dstlen) *dst = (WCHAR)ch; + return 1; + } + } + else { + while (*map) + { + size_decomp = decompose_char_recursive( compat, *map, dst, dstlen ); + dstlen -= size_decomp; + if (dstlen < 0) dstlen = 0; + dst += size_decomp; + map++; + total_decomp += size_decomp; + } + return total_decomp; + } +} + +int wine_unicode_decompose_string( int compat, const WCHAR *src, + int srclen, WCHAR *dst, int dstlen ) +{ + UINT ch; + int srcpos = 0, dstpos = 0; + int decomp_len; + + if (dstlen < 0) dstlen = 0; + + while (srcpos < srclen) + { + ch = src[srcpos]; + + if (ch >= 0xd800 && ch <= 0xdbff) /* high surrogate */ + { + WCHAR hi, lo; + if (srcpos+1 == srclen) return -srcpos; + hi = (WCHAR)ch; + lo = src[++srcpos]; + if (lo < 0xdc00 || lo > 0xdfff) return -srcpos; + ch = utf16_surrogates_to_codepoint( hi, lo ); + } + else if (ch >= 0xdc00 && ch <= 0xdfff) /* low surrogate */ + { + return -srcpos; + } + + decomp_len = decompose_char_recursive( compat, ch, dst+dstpos, dstlen ); + dstpos += decomp_len; + + if (dstlen > 0) + { + dstlen -= decomp_len; + while (dstlen < 0) + { + dstpos--; + dstlen++; + } + } + + ++srcpos; + } + + return dstpos; } EOF close OUTPUT; -- 2.14.1