#!/usr/bin/perl -w # This script generates utf-8 TeX hyphenation patterns for Greek # Copyright (C) 2006 by Peter Heslin # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. use strict; my $version = '0.1'; use utf8; binmode STDOUT, ":utf8"; sub usage { my $usage = q{ elhyph-utf8 -a | -m | -p [options] This script takes as input a set of TeX hyphenation patterns written for one of the old 8-bit Greek encodings and turns them into UTF-8 patterns for use with a Unicode-aware program, such as XeTeX. The patterns can be adjusted to tolerate a variety of styles of Unicode usage. As a prerequisite you will need to install the patterns from the elhyphen package by Dimitrios Filippou, available from CTAN (tex-archive/language/hyphenation/elhyphen/). The elhyphen patterns must be installed where kpathsea can find them, or else be present in the current directory. You must give one of the options -a, -m, or -p to indicate which set of elhyphen patterns to work from: -a Use GRAhyph4.tex (patterns for ancient, polytonic Greek) -m Use GRMhyph4.tex (patterns for modern, monotonic Greek) -p Use GRPhyph4.tex (patterns for modern Greek written with polytonic accents) With no other options, the default is to output patterns in every permutation of Unicode usage that this program supports. This should be OK, as I do not believe that any of these patterns should interfere with each other. If this generality is not required or to decrease the size of the output, you can specify exactly what style of Unicode input you will be using via the options below. The styles of encoding supported are: -c Use Unicode normalization form C (pre-composed combinations of vowels and diacritics). -d Use Unicode normalization form D (vowels followed by combining diacritics). -X Where there is a choice of using pre-composed code points from either the basic Greek and Coptic code-page (U+0370) or from the Greek Extended code-page (U+1F00), use the basic Greek. (This choice concerns vowels with an acute accent only or an acute plus diaresis.) -x For the choice above, use the equivalent glyph from the Greek Extended code-page. -l Use lunate sigmas in both the medial and final positions. -L Don't allow lunate sigmas. -e In addition to using the plain old "Apostrophe" (U+0027) to indicate elision, support the use of a variety of other apostrophe-like characters for this purpose. The supported characters are "Right Single Quotation Mark" (U+2019 -- this is the preferred usage), "Modifier Letter Apostrophe" (U+02BC), "Greek Koronis" (U+1FBD), and "Greek Psili" (U+1FBF). -E Only use plain apostrophe for elision. }; print $usage; exit; } my (%opt, %seen, %X, %B, @plain_vowels, @other_vowels, @tonos_vowels, @extended_vowels, @oxia_vowels, @accent_combos, $vowels_latin, %vowel_map, $consonants_latin, $accents_latin, %accent_map, @apostrophes); my $argv = join ' ', @ARGV; use Getopt::Std; getopts('ampcdXxlLeE?h', \%opt) or usage(); usage() if $opt{"?"} or $opt{h}; unless (($opt{a} ? 1 : 0) + ($opt{m} ? 1 : 0) + ($opt{p} ? 1 : 0) == 1) { print "You must specify the input patterns via one (and only one) of the -a, -m or -p options\n"; exit; } my $label = $opt{a} ? "Ancient Greek" : $opt{m} ? "Modern Greek" : $opt{p} ? "Modern, Polytonic Greek" : die; # Unless a choice is expressed via the options, default to using all # input-style permutations. unless ($opt{c} or $opt{d}) { $opt{c} = 1; $opt{d} = 1; } if ($opt{c} and not $opt{x} and not $opt{X}) { $opt{x} = 1; $opt{X} = 1; } if (not $opt{c} and ($opt{x} or $opt{X})) { die "Options -x and -X only make sense with option -c\n" } $opt{l} = 1 unless $opt{L}; $opt{e} = 1 unless $opt{E}; my $polytonic = ($opt{a} or $opt{p}) ? 1 : 0; my $file = $opt{a} ? "GRAhyph4.tex" : $opt{m} ? "GRMhyph4.tex" : $opt{p} ? "GRPhyph4.tex" : die "Program Error"; my $path = `kpsewhich $file`; chomp $path; $path = './' . $file unless $path; open IN, $path or die "Could not open file $file at path $path"; sub preamble { my $preamble = qq{ \\message{UTF-8 hyphenation patterns for $label} % This file was mechanically translated from a set of elhyphen patterns % via "elhyph-utf8 $argv" (version $version). \\begingroup }; $preamble .= "\\lccode`'=`'"; if ($opt{e}) { foreach my $a (@apostrophes) { $preamble .= "\\lccode`$a=`$a"; } } $preamble .= "\n\\patterns{%\n"; print $preamble; } initialize(); # Skip over preliminary matter in file -- this will break if patterns # begin on the same line as the patterns macro itself while() {last if m/\\patterns{%/} preamble(); LINE: while (my $line = ) { last LINE if $line =~ m/^}/; if ($line =~ m/^\s*%/) { print $line; next LINE; } my @pats = split ' ', $line; PATTERN: while (@pats) { my $p = shift @pats; if ($p =~ m/%/) { print join ' ', ' ', $p, @pats, "\n"; next LINE ; } # Hyphenation after vowels if ($p =~ m/^($vowels_latin+)1$/) { my $vowel = $1; # Emit full set of pre-combined chars (only once) if ($opt{c} and $vowel eq "a") { print "%-- Vowel patterns (pre-combined)\n"; my @vv = (@plain_vowels, @other_vowels); if ($polytonic) { push @vv, @extended_vowels; if ($opt{X}) { push @vv, @tonos_vowels; } if ($opt{x}) { push @vv, @oxia_vowels; } } else { push @vv, @tonos_vowels; } foreach my $v (@vv) { print $v . "1 "; } print "\n"; } if ($opt{d}) { print "%-- Vowel patterns (combining accents)\n"; foreach my $acc (@accent_combos) { print $vowel_map{$vowel} . "2" . $acc . "1 "; } } } elsif ($p =~ m/^${vowels_latin}2\|1$/) { # We omit the special iota-subscript patterns, since we # deal with these elsewhere next LINE; } elsif ($p eq '>2r1<2r') { # Special case for two rhos with breathings my $r = ''; $r .= 'ῤ1ῥ ' if $opt{c}; $r .= 'ρ2̓1ρ2̔' if $opt{d}; $p = $r; } else { # Convert consonants to utf-8 $p =~ s/($consonants_latin)/$B{$1}/g; # Convert vowels my $c = ''; my $d = ''; # Combining diacritics if ($opt{d}) { $d = $p; $d =~ s/($accents_latin*)($vowels_latin)(\|?)/ my $v = $2; my $a = ($1 || '') . ($3 || ''); $a =~ s#(.)#$accent_map{$1}#g; $vowel_map{$v} . $a; /gex; } # Pre-composed if ($opt{c}) { my $cx = ''; my $cX = ''; # Prefer basic codepage if ($opt{X}) { $cX = $p; $cX =~ s#($accents_latin*$vowels_latin\|?)# exists $B{$1} ? $B{$1} : $X{$1}; #gex; } # Prefer extended codepage if ($opt{x}) { $cx = $p; $cx =~ s/($accents_latin*$vowels_latin\|?)/ exists $X{$1} ? $X{$1} : $B{$1}; /gex; } $c = $cX . ' ' . $cx; } $p = $d . ' ' . $c; } # Apostrophe if ($p =~ /''/) { my $e = $p; $e =~ s/''/'/g; my $E = ''; if ($opt{e}) { foreach my $apos (@apostrophes) { my $tmp = $p; $tmp =~ s/''/$apos/g; $E = $E . $tmp . ' ' ; } } $p = $e . ' ' . $E; } # Lunate sigma if ($opt{l} and $p =~ m/[σς]/) { my $l = $p; $l =~ s/[σς]/$B{lunates}/ge; $p = $p . ' ' . $l; } # Eliminate duplicates foreach my $pat (split ' ', $p) { print $pat . ' ' unless $seen{$pat}; $seen{$pat}++; } } print "\n"; } print "}\n\\endgroup\n\\endinput\n"; close IN; sub initialize { @plain_vowels = qw(α ε η ι ο υ ω); # These do not have equivalents in the extended codepage @other_vowels = qw(ϊ ϋ); # These are used when employing the basic codepage, either alone or mixed @tonos_vowels = qw(ά έ ή ί ό ύ ώ ΐ ΰ); # These are the vowels from the extended page, excluding rho and # chars with oxia that have equivalents on the basic page @extended_vowels = qw(ἀ ἁ ἂ ἃ ἄ ἅ ἆ ἇ ἐ ἑ ἒ ἓ ἔ ἕ ἠ ἡ ἢ ἣ ἤ ἥ ἦ ἧ ἰ ἱ ἲ ἳ ἴ ἵ ἶ ἷ ὀ ὁ ὂ ὃ ὄ ὅ ὐ ὑ ὒ ὓ ὔ ὕ ὖ ὗ ὠ ὡ ὢ ὣ ὤ ὥ ὦ ὧ ὰ ὲ ὴ ὶ ὸ ὺ ὼ ᾀ ᾁ ᾂ ᾃ ᾄ ᾅ ᾆ ᾇ ᾐ ᾑ ᾒ ᾓ ᾔ ᾕ ᾖ ᾗ ᾠ ᾡ ᾢ ᾣ ᾤ ᾥ ᾦ ᾧ ᾲ ᾳ ᾴ ᾶ ᾷ ῂ ῃ ῄ ῆ ῇ ῒ ῖ ῗ ῢ ῦ ῧ ῲ ῳ ῴ ῶ ῷ); # The vowels with oxia excluded above @oxia_vowels = qw(ά έ ή ί ό ύ ώ ΐ ΰ); # Accents combined in the correct, canonical order @accent_combos = qw(́ ̀ ͂ ̓ ̓́ ̓̀ ̓͂ ̔ ̔́ ̔̀ ̔͂ ͅ ́ͅ ̀ͅ ͂ͅ ̓ͅ ̓́ͅ ̓̀ͅ ̓͂ͅ ̔ͅ ̔́ͅ ̔̀ͅ ̔͂ͅ ̈ ̈́ ̈̀ ̈͂ ); $vowels_latin = '[aeiouhwr]'; # vowels plus rho %vowel_map = ( a => "α", e => "ε", h => "η", i => "ι", o => "ο", u => "υ", w => "ω", r => "ρ"); $consonants_latin = '[bgdzjklmnxpcstfqy]'; # minus rho $accents_latin = q([<>'`~"|]); %accent_map = qw(< ̔ > ̓ ' ́ ` ̀ ~ ͂ " ̈ | ͅ ); # Other chars to use for elision @apostrophes = qw(’ ʼ ᾽ ᾿); # Basic Greek codepage (U+0370 - 03FF) $B{q{"'i}} = "ΐ"; $B{q{'a}} = "ά"; $B{q{'e}} = "έ"; $B{q{'h}} = "ή"; $B{q{'i}} = "ί"; $B{q{"'u}} = "ΰ"; $B{a} = "α"; $B{b} = "β"; $B{g} = "γ"; $B{d} = "δ"; $B{e} = "ε"; $B{z} = "ζ"; $B{h} = "η"; $B{j} = "θ"; $B{i} = "ι"; $B{k} = "κ"; $B{l} = "λ"; $B{m} = "μ"; $B{n} = "ν"; $B{x} = "ξ"; $B{o} = "ο"; $B{p} = "π"; $B{r} = "ρ"; $B{c} = "ς"; $B{s} = "σ"; $B{t} = "τ"; $B{u} = "υ"; $B{f} = "φ"; $B{q} = "χ"; $B{y} = "ψ"; $B{w} = "ω"; $B{q{"i}} = "ϊ"; $B{q{"u}} = "ϋ"; $B{q{'o}} = "ό"; $B{q{'u}} = "ύ"; $B{q{'w}} = "ώ"; $B{lunates} = "ϲ"; # Extended Greek codepage (U+1FOO - 1FFFF) $X{q{>a}} = "ἀ"; $X{q{`a}} = "ἂ"; $X{q{<`a}} = "ἃ"; $X{q{>'a}} = "ἄ"; $X{q{<'a}} = "ἅ"; $X{q{>~a}} = "ἆ"; $X{q{<~a}} = "ἇ"; $X{q{>e}} = "ἐ"; $X{q{`e}} = "ἒ"; $X{q{<`e}} = "ἓ"; $X{q{>'e}} = "ἔ"; $X{q{<'e}} = "ἕ"; $X{q{>h}} = "ἠ"; $X{q{`h}} = "ἢ"; $X{q{<`h}} = "ἣ"; $X{q{>'h}} = "ἤ"; $X{q{<'h}} = "ἥ"; $X{q{>~h}} = "ἦ"; $X{q{<~h}} = "ἧ"; $X{q{>i}} = "ἰ"; $X{q{`i}} = "ἲ"; $X{q{<`i}} = "ἳ"; $X{q{>'i}} = "ἴ"; $X{q{<'i}} = "ἵ"; $X{q{>~i}} = "ἶ"; $X{q{<~i}} = "ἷ"; $X{q{>o}} = "ὀ"; $X{q{`o}} = "ὂ"; $X{q{<`o}} = "ὃ"; $X{q{>'o}} = "ὄ"; $X{q{<'o}} = "ὅ"; $X{q{>u}} = "ὐ"; $X{q{`u}} = "ὒ"; $X{q{<`u}} = "ὓ"; $X{q{>'u}} = "ὔ"; $X{q{<'u}} = "ὕ"; $X{q{>~u}} = "ὖ"; $X{q{<~u}} = "ὗ"; $X{q{>w}} = "ὠ"; $X{q{`w}} = "ὢ"; $X{q{<`w}} = "ὣ"; $X{q{>'w}} = "ὤ"; $X{q{<'w}} = "ὥ"; $X{q{>~w}} = "ὦ"; $X{q{<~w}} = "ὧ"; $X{q{`a}} = "ὰ"; $X{q{'a}} = "ά"; $X{q{`e}} = "ὲ"; $X{q{'e}} = "έ"; $X{q{`h}} = "ὴ"; $X{q{'h}} = "ή"; $X{q{`i}} = "ὶ"; $X{q{'i}} = "ί"; $X{q{`o}} = "ὸ"; $X{q{'o}} = "ό"; $X{q{`u}} = "ὺ"; $X{q{'u}} = "ύ"; $X{q{`w}} = "ὼ"; $X{q{'w}} = "ώ"; $X{q{>a|}} = "ᾀ"; $X{q{`a|}} = "ᾂ"; $X{q{<`a|}} = "ᾃ"; $X{q{>'a|}} = "ᾄ"; $X{q{<'a|}} = "ᾅ"; $X{q{>~a|}} = "ᾆ"; $X{q{<~a|}} = "ᾇ"; $X{q{>h|}} = "ᾐ"; $X{q{`h|}} = "ᾒ"; $X{q{<`h|}} = "ᾓ"; $X{q{>'h|}} = "ᾔ"; $X{q{<'h|}} = "ᾕ"; $X{q{>~h|}} = "ᾖ"; $X{q{<~h|}} = "ᾗ"; $X{q{>w|}} = "ᾠ"; $X{q{`w|}} = "ᾢ"; $X{q{<`w|}} = "ᾣ"; $X{q{>'w|}} = "ᾤ"; $X{q{<'w|}} = "ᾥ"; $X{q{>~w|}} = "ᾦ"; $X{q{<~w|}} = "ᾧ"; $X{q{`a|}} = "ᾲ"; $X{q{a|}} = "ᾳ"; $X{q{'a|}} = "ᾴ"; $X{q{~a}} = "ᾶ"; $X{q{~a|}} = "ᾷ"; $X{q{`h|}} = "ῂ"; $X{q{h|}} = "ῃ"; $X{q{'h|}} = "ῄ"; $X{q{~h}} = "ῆ"; $X{q{~h|}} = "ῇ"; $X{q{"`i}} = "ῒ"; $X{q{"'i}} = "ΐ"; $X{q{~i}} = "ῖ"; $X{q{"~i}} = "ῗ"; $X{q{"`u}} = "ῢ"; $X{q{"'u}} = "ΰ"; $X{q{>r}} = "ῤ"; $X{q{