#!/usr/bin/perl -w

# transliterate: Transliterate ASCII into other scripts

# Author: Steven Thomas Smith , 2007-01-01: Version 0.1

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

use strict;
use warnings;

################### default arguments and constants ####################

# These are file-scoped lexicals because the help text interpolates them.
my $default_language      = "arabic";
my $default_arabic_flavor = 'arabtex';
my $default_arabic_demode = 'novowels';
my $default_arabic_enmode = 'novocalize';
# %Encode::Arabic::ArabTeX::modemap = ('default' => 3,
#     'fullvocalize' => 4, 'full' => 4 ,
#     'vocalize' => 3, 'nosukuun' => 3,
#     'novocalize' => 2, 'novowels' => 2, 'none' => 2,
#     'noshadda' => 1, 'noneplus' => 1,
#     'undef' => 0);
my $default_colloquial_arabic        = 1;
my $default_octal_utf8_flag          = 1;
my $default_latin_transcription_flag = 0;
my $default_debugging                = 0;

# Return a fresh option hash (as a key/value list) holding the defaults.
sub default_options {
    return (
        language                 => $default_language,
        arabic_flavor            => $default_arabic_flavor,
        arabic_demode            => $default_arabic_demode,
        arabic_enmode            => $default_arabic_enmode,
        colloquial_arabic        => $default_colloquial_arabic,
        octal_utf8_flag          => $default_octal_utf8_flag,
        latin_transcription_flag => $default_latin_transcription_flag,
        debugging                => $default_debugging,   # set by -d, not read anywhere yet
        helpme                   => 0,
    );
}

# Parse command-line switches.
#   $opt  - hash reference of options, updated in place
#   $args - array reference of arguments; it is consumed, so passing \@ARGV
#           leaves @ARGV empty and a later <> reads standard input
# Dies with a usage hint on an unknown flag, a non-flag argument, or a
# missing value for one of the --arabic-* switches.
sub parse_switches {
    my ($opt, $args) = @_;

    # Switches that select a language from the %alphabet table.
    my %language_for = (
        '-G' => "greek",    '--Greek'    => "greek",    '--greek'    => "greek",
        '-C' => "cyrillic", '--Cyrillic' => "cyrillic", '--cyrillic' => "cyrillic",
        '-H' => "hebrew",   '--Hebrew'   => "hebrew",   '--hebrew'   => "hebrew",
    );
    # Switches that flip a boolean option (each use toggles the value).
    my %toggle_for = (
        '--colloquial-arabic'    => 'colloquial_arabic',
        '--octal'                => 'octal_utf8_flag',
        '--octal-utf8'           => 'octal_utf8_flag',
        '--latin-transcription'  => 'latin_transcription_flag',
    );

    # Loop on the argument count, not on the truth of the next argument, so an
    # argument such as "0" is reported instead of silently ending the parse.
    while (@{$args}) {
        my $switch = shift @{$args};

        # The last alternative is the UTF-8 byte sequence of the Arabic-script
        # spelling; the anchor has to precede the leading "--".
        if ($switch =~ /^(?:-A|--[Aa]rabic|--al-9arabiya|--al-\`arabiya|--\330\247\331\204\330\271\330\261\330\250\331\212\330\251)$/) {
            $opt->{language} = "arabic";
        }
        elsif ($switch =~ /^--arabic-(flavor|demode|enmode)$/) {
            my $key = "arabic_$1";
            die sprintf("transliterate: Flag \`%s' requires a value.\n"
                . "Type \`transliterate --help' for help.\n", $switch)
                unless @{$args};
            $opt->{$key} = shift @{$args};
        }
        elsif (exists $toggle_for{$switch}) {
            my $key = $toggle_for{$switch};
            $opt->{$key} = !$opt->{$key};
        }
        elsif (exists $language_for{$switch}) {
            $opt->{language} = $language_for{$switch};
        }
        elsif ($switch =~ /^(?:-d|--debug|--debugging)$/) {
            $opt->{debugging} = 1;
        }
        elsif ($switch =~ /^(?:-h|--help)$/) {
            $opt->{helpme} = 1;
        }
        elsif ($switch =~ /^-/) {
            die sprintf("transliterate: Unknown flag \`%s'.\n"
                . "Type \`transliterate --help' for help.\n", $switch);
        }
        else {
            die sprintf("transliterate: Unknown argument \`%s'.\n"
                . "Type \`transliterate --help' for help.\n", $switch);
        }
    }
    return;
}

# Return the help text.  Octal escapes are UTF-8 bytes of the native scripts.
sub help_text {
    return <<"HELP STRING";
transliterate: Transliterate ASCII string into the specified script
(UTF8 encoding).

The command line is:

    transliterate [OPTIONS]

OPTIONS

-A
--Arabic
--arabic
--al-9arabiya
--al-\`arabiya
--\330\247\331\204\330\271\330\261\330\250\331\212\330\251
    Transliterate Arabic using the (Levantine) Colloquial Arabic
    transliteration scheme given in Leslie J. McLoughlin's book
    (Routledge, 1982), followed by the ArabTeX encoding scheme.
    This combination gives rapid access to encodings of colloquial
    Arabic text:

00:\t'\t: hamza\t:: \331\207\331\205\330\262\330\251\t: \330\241
01:\ta\t: 'alif\t:: \330\243\331\204\331\201\t: \330\247
02:\tb\t: baa'\t:: \330\250\330\247\330\241\t: \330\250
03:\tt\t: taa'\t:: \330\252\330\247\330\241\t: \330\252
04:\tth\t: thaa'\t:: \330\253\330\247\330\241\t: \330\253
05:\tj\t: jeem\t:: \330\254\331\212\331\205\t: \330\254
06:\tH\t: Haa'\t:: \330\255\330\247\330\241\t: \330\255
06:\t7\t: 7aa'\t:: \330\255\330\247\330\241\t: \330\255
07:\tkh\t: khaa'\t:: \330\256\330\247\330\241\t: \330\256
08:\td\t: daal\t:: \330\257\330\247\331\204\t: \330\257
09:\tdh\t: dhaal\t:: \330\260\330\247\331\204\t: \330\260
10:\tr\t: raa'\t:: \330\261\330\247\330\241\t: \330\261
11:\tz\t: zaY\t:: \330\262\330\247\331\211\t: \330\262
12:\ts\t: seen\t:: \330\263\331\212\331\206\t: \330\263
13:\tsh\t: sheen\t:: \330\264\331\212\331\206\t: \330\264
14:\tS\t: Saad\t:: \330\265\330\247\330\257\t: \330\265
15:\tD\t: Daad\t:: \330\266\330\247\330\257\t: \330\266
16:\tT\t: Taa'\t:: \330\267\330\247\330\241\t: \330\267
17:\tZ\t: Zaa'\t:: \330\270\330\247\330\241\t: \330\270
18:\t9\t: 9ayn\t:: \330\271\331\212\331\206\t: \330\271
18:\t3\t: 3ayn\t:: \330\271\331\212\331\206\t: \330\271
19:\tgh\t: ghayn\t:: \330\272\331\212\331\206\t: \330\272
20:\tf\t: faa'\t:: \331\201\330\247\330\241\t: \331\201
21:\tq\t: qaaf\t:: \331\202\330\247\331\201\t: \331\202
22:\tk\t: kaaf\t:: \331\203\330\247\331\201\t: \331\203
23:\tl\t: laam\t:: \331\204\330\247\331\205\t: \331\204
24:\tm\t: meem\t:: \331\205\331\212\331\205\t: \331\205
25:\tn\t: noon\t:: \331\206\331\210\331\206\t: \331\206
26:\th\t: haa'\t:: \331\207\330\247\330\241\t: \331\207
27:\tw\t: waw\t:: \331\210\330\247\331\210\t: \331\210
28:\ty\t: yaa'\t:: \331\212\330\247\330\241\t: \331\212
29:\ta\t: fatHa\t:: \331\201\330\252\330\255\330\251\t: \330\271\331\216
30:\ti\t: kesra\t:: \331\203\330\263\330\261\330\251\t: \330\271\331\220
31:\tu\t: Damma\t:: \330\266\331\205\330\251\t: \330\271\331\217
32:\taa\t: 'alif Taweela\t:: \330\243\331\204\331\201\040\330\267\331\210\331\212\331\204\330\251\t:
33:\tee\t: long yaa'\t:: \t: \331\212
34:\too\t: long waw\t:: \t: \331\210\331\217
35:\tay\t: dipthong\t:: \t: \331\212
36:\taw\t: dipthong\t:: \t: \331\210\331\216
37:\to\t: dipthong\t:: \t:
38:\tah\\b\t: taa' marbuTa\t:: \330\252\330\247\330\241\040\331\205\330\261\330\250\331\210\330\267\330\251\t: \330\251
39:\tY\\b\t: 'alif maqSura\t:: \330\243\331\204\331\201\040\331\205\331\202\330\265\331\210\330\261\330\251\t: \331\211
40:\tayu\\b\t: 'alif maqSura\t:: \330\243\331\204\331\201\040\331\205\331\202\330\265\331\210\330\261\330\251\t: \331\211
41:\tawu\\b\t: 'alif maqSura\t:: \330\243\331\204\331\201\040\331\205\331\202\330\265\331\210\330\261\330\251\t: \331\211
42:\tg\t: geem\t:: \332\206\331\212\331\205\t: \332\206
43:\tp\t: paa'\t:: \331\276\330\247\330\241\t: \331\276
44:\tv\t: vaa'\t:: \332\244\330\247\330\241\t: \332\244

    Add a hyphen to distingush between th/t-h, kh/k-h, dh/d-h,
    sh/s-h, and gh/g-h.

    Note that McLoughlin's and other's transliteration is used where
    the solitary numerals '9' and '3' represent the consonant \`ayn
    (\330\271) [because the final part of "nine" sounds like \`ayn];
    this is consistent with David Cowan's transliteration scheme:
    "Orientalists are accustomed to transliterate this letter by an
    inverted comma but this may encourage the student to neglect it"
    [Modern Literary Arabic, Cambridge (1958), p. 4].  Also, the
    solitary numeral '7' represents Haa' (\330\255).

    The script transliterates these to their ArabTeX equivalents,
    then decodes using ArabTeX, so all ArabTeX codes will also work,
    except for tying words together with these letter combinations.
    To account for this, the script will remove one hyphen between
    {t,k,d,s,g} and h, i.e., t-h becomes th, t--h becomes t-h, and
    so forth.

    For taa' marbuTa (\330\251), use "ah" at the word's end; for
    'alif/haa', use ArabTeX's "invisible consonant" `|', i.e., "a|h".

    For more information see:

    http://ufal.mff.cuni.cz/padt/PADT_1.0/tools/Encode-Arabic/html/Encode/Arabic/ArabTeX.html
    ftp://ftp.informatik.uni-stuttgart.de/pub/arabtex/arabtex.htm

    Note: When the default ArabTeX code is used, the initial
    transliteration takes an inordinately long time.  Be patient;
    perhaps this will be improved in a future release.

--arabic-flavor
    Flavor of Arabic transliteration: ArabTeX, ArabTeX-DMG, or
    Buckwalter.  [Default: $default_arabic_flavor]

--arabic-demode
    Decoding mode for ASCII transliteration into Arabic.  Options
    are self-explanatory from the mode definition:

    %Encode::Arabic::ArabTeX::modemap = ('default' => 3,
        'fullvocalize' => 4, 'full' => 4 ,
        'vocalize' => 3, 'nosukuun' => 3,
        'novocalize' => 2, 'novowels' => 2, 'none' => 2,
        'noshadda' => 1, 'noneplus' => 1,
        'undef' => 0);

    [Default: $default_arabic_demode]

--arabic-enmode
    Encoding mode from Arabic into a latin transliteration.  See
    --arabic-demode for options.  [Default: $default_arabic_enmode]

--colloquial-arabic
    Turn off colloquial Arabic tranliteration; use the
    transliteration defined by --arabic-demode [or default].

-G
--Greek
--greek
    Transliterate Greek using the table:

    ABCDEFGHIJKLMNOPQRSTUVWXYZ S\\b
    \316\221\316\222\316\247\316\224\316\225\316\246\316\223\316\227\316\231\316\231\316\232\316\233\316\234\316\235\316\237\316\240\316\230\316\241\316\243\316\244\316\245\316\250\316\251\316\236\316\227\316\226\040\316\243

    abcdefghijklmnopqrstuvwxyz s\\b
    \316\261\316\262\317\207\316\264\316\265\317\206\316\263\316\267\316\271\316\271\316\272\316\273\316\274\316\275\316\277\317\200\316\270\317\201\317\203\317\204\317\205\317\210\317\211\316\276\316\267\316\266\040\317\202

    No breathings are implemented -- this is a straight hash table.

-C
--Cyrillic
--cyrillic
    Transliterate Cyrillic [not yet implemented].

-H
--Hebrew
--hebrew
    Transliterate Hebrew [not yet implemented].

--octal
--octal-utf8
    Print out octal UTF-8 format.  [Default: $default_octal_utf8_flag]

--latin-transcription
    Print out phonetic Latin transcription.
    [Default: $default_latin_transcription_flag]

--debug
    Debug this script.

-h
--help
    Print this help string.

HELP STRING
}

# Page the help text through the user's PAGER, or more/less/cat if unset.
sub show_help {
    $ENV{LESSCHARSET} = "utf-8";

    my $pager;
    if ($ENV{PAGER}) {    # run the user's PAGER
        # A single command string after '|-' still goes through the shell
        # when needed, so a PAGER such as "less -R" keeps working.
        open($pager, '|-', $ENV{PAGER})
            || die "Cannot run $ENV{PAGER}: $!\n";
    }
    else {
        open($pager, '|-', 'more')
            || open($pager, '|-', 'less')
            || open($pager, '|-', 'cat')
            || die "Cannot run more or less or cat: $!\n";
    }
    print {$pager} help_text();
    close($pager);
    return;
}

# Convert nonword characters of a byte string to backslash octal format,
# e.g. "a b" becomes 'a\040b'.
sub backslashoctal {
    my ($str) = @_;
    # Use the capture rather than $&; ord() of a single byte equals unpack 'C'.
    $str =~ s/(\W)/sprintf("\\%03o", ord($1))/ge;
    return $str;
}

# Convert a Unicode code point into its UTF-8 byte string;
# e.g., "aleph" == unicode2utf8(0x5D0).  Code points of 0x110000 and above
# yield the empty string.  See http://en.wikipedia.org/wiki/UTF-8
sub unicode2utf8 {
    my ($n) = @_;

    return chr($n) if $n < 0x80;

    return pack('C2',
        0b11000000 | (0b00011111 & ($n >> 6)),
        0b10000000 | (0b00111111 & $n))
        if $n < 0x800;

    return pack('C3',
        0b11100000 | (0b00001111 & ($n >> 12)),
        0b10000000 | (0b00111111 & ($n >> 6)),
        0b10000000 | (0b00111111 & $n))
        if $n < 0x10000;

    return pack('C4',
        0b11110000 | (0b00000111 & ($n >> 18)),
        0b10000000 | (0b00111111 & ($n >> 12)),
        0b10000000 | (0b00111111 & ($n >> 6)),
        0b10000000 | (0b00111111 & $n))
        if $n < 0x110000;

    return "";
}

# define language transliteration hashes
my %alphabet = (
    greek => {
        A => 0x0391, a => 0x03B1,    # alpha
        B => 0x0392, b => 0x03B2,    # beta
        C => 0x03A7, c => 0x03C7,    # chi
        D => 0x0394, d => 0x03B4,    # delta
        E => 0x0395, e => 0x03B5,    # epsilon
        F => 0x03A6, f => 0x03C6,    # phi
        G => 0x0393, g => 0x03B3,    # gamma
        H => 0x0397, h => 0x03B7,    # eta
        I => 0x0399, i => 0x03B9,    # iota
        J => 0x0399, j => 0x03B9,    # iota
        K => 0x039A, k => 0x03BA,    # kappa
        L => 0x039B, l => 0x03BB,    # lambda
        M => 0x039C, m => 0x03BC,    # mu
        N => 0x039D, n => 0x03BD,    # nu
        O => 0x039F, o => 0x03BF,    # omicron
        P => 0x03A0, p => 0x03C0,    # pi
        Q => 0x0398, q => 0x03B8,    # theta
        R => 0x03A1, r => 0x03C1,    # rho
        S => 0x03A3, s => 0x03C3,    # sigma
        T => 0x03A4, t => 0x03C4,    # tau
        U => 0x03A5, u => 0x03C5,    # upsilon
        V => 0x03A8, v => 0x03C8,    # psi
        W => 0x03A9, w => 0x03C9,    # omega
        X => 0x039E, x => 0x03BE,    # xsi
        Y => 0x0397, y => 0x03B7,    # eta
        Z => 0x0396, z => 0x03B6,    # zeta
        # Final sigma is U+03C2; it has no capital form, so a word-final
        # "S" stays a capital sigma.  (U+03DA/U+03DB are the letter stigma.)
        Sigma_final => 0x03A3, sigma_final => 0x03C2    # final sigma
    },
    # obviously not yet implemented: every letter maps to code point 0
    cyrillic => { map { ($_ => 0x00) } ('A' .. 'Z', 'a' .. 'z') },
    hebrew   => { map { ($_ => 0x00) } ('A' .. 'Z', 'a' .. 'z') },
);

# Arabic transliteration
my %colloquial_to_ArabTeX_hash = (
    th  => "_t",    # thaa'
    j   => "^g",    # jeem
    H   => ".h",    # haa' [n.b., this is a synonym for T == taa' marbuTa in ArabTeX]
    7   => ".h",    # haa'
    kh  => "_h",    # khaa'
    dh  => "_d",    # dhaal
    sh  => "^s",    # sheen
    S   => ".s",    # Saad
    D   => ".d",    # Daad
    T   => ".t",    # Taa'
    Z   => ".z",    # Zaa'
    9   => "`",     # 9ayn
    3   => "`",     # 3ayn
    gh  => ".g",    # ghayn
    aa  => "aa",    # 'alif Taweela [NOP]
    ee  => "iy",    # long yaa'
    oo  => "uw",    # long waw
    ah  => "H",     # taa' marbuTa [at word boundary]
    Y   => "Y",     # 'alif maqSura [at word boundary]
    ayu => "_A",    # 'alif maqSura [at word boundary]
    awu => "_A",    # 'alif maqSura [at word boundary]
    g   => "^c"     # geem
);

# Rewrite one line of colloquial-Arabic transliteration into ArabTeX notation.
# Takes the input string and returns the rewritten copy.
sub colloquial_to_ArabTeX {
    my ($str) = @_;

    # 9ayn, 3ayn, 7aa': a solitary numeral, i.e. not next to a digit or a dot.
    # This runs first so the dots inserted by the letter pass below (S => ".s")
    # cannot make "9S" look like a number.  Look-arounds do not consume the
    # neighbours, so both numerals in "9a9" are converted.
    $str =~ s/(?<![0-9.])([379])(?![0-9.])/$colloquial_to_ArabTeX_hash{$1}/g;

    # letters
    $str =~ s/(th|j|H|kh|dh|sh|S|D|T|Z|gh|aa|ee|oo|g)/$colloquial_to_ArabTeX_hash{$1}/g;

    # letters at word boundary
    $str =~ s/(ah|Y|ayu|awu)\b/$colloquial_to_ArabTeX_hash{$1}/g;

    # Remove one hyphen for t-h, k-h, d-h, s-h, g-h, retaining any further
    # hyphens (ArabTeX lengthening).  A plain "g" is already "^c" here, and
    # the look-behind skips the t/d/s that end an ArabTeX code (_t, .d, ^s).
    $str =~ s/((?<![_^.])[tkds]|\^c)-(-*)h/$1$2h/g;

    return $str;
}

# Transliterate one line through the %alphabet table of $language.
# Returns the line with every ASCII letter replaced by UTF-8 bytes.
sub transliterate_alphabet {
    my ($language, $line) = @_;
    my $table = $alphabet{$language};

    # terminal sigma: must run before the letter pass consumes the "s"
    $line =~ s/([Ss])\b/unicode2utf8($table->{"${1}igma_final"})/ge
        if $language eq "greek";

    # letters
    $line =~ s/([A-Za-z])/unicode2utf8($table->{$1})/ge;

    return $line;
}

# Read lines from <> and print their Arabic rendering, using Encode::Arabic.
#   $opt - hash reference of options as filled in by parse_switches
sub run_arabic {
    my ($opt) = @_;

    # Load at run time: a compile-time "use" would make the other languages
    # fail too when Encode::Arabic is not installed.
    eval {
        require Encode::Arabic;
        Encode::Arabic->import(':modes');
        1;
    } or die "transliterate: Cannot load Encode::Arabic: $@";

    my $flavor = lc $opt->{arabic_flavor};
    demode($flavor, lc $opt->{arabic_demode});
    enmode($flavor, lc $opt->{arabic_enmode});

    # NOTE(review): this option used to print the Arabic-script line a second
    # time.  Encode::Arabic's synopsis appears to render the Latin phonetic
    # transcription by decoding with 'arabtex-zdmg' -- confirm against the
    # module docs.  Other flavors keep the previous behavior.
    my $latin_flavor = $flavor eq 'arabtex' ? 'arabtex-zdmg' : $flavor;

    while (defined(my $line = <>)) {
        chomp $line;    # chop would eat a real character on an unterminated last line
        $line = colloquial_to_ArabTeX($line) if $opt->{colloquial_arabic};

        # Arabic script proper
        print encode('utf8', decode($flavor, $line . "\n"));

        # Metaquoted utf8
        print backslashoctal(encode('utf8', decode($flavor, $line))) . "\n"
            if $opt->{octal_utf8_flag};

        # Latin phonetic transcription
        print encode('utf8', decode($latin_flavor, $line . "\n"))
            if $opt->{latin_transcription_flag};
    }
    return;
}

# Read lines from <> and print them transliterated through %alphabet.
#   $opt      - hash reference of options as filled in by parse_switches
#   $language - lower-cased language name, a key of %alphabet
sub run_alphabet {
    my ($opt, $language) = @_;

    # The placeholder tables map every letter to code point 0.
    warn "transliterate: \u$language transliteration is not yet implemented.\n"
        if $language eq "cyrillic" || $language eq "hebrew";

    while (defined(my $line = <>)) {
        chomp $line;
        $line = transliterate_alphabet($language, $line);
        print $line . "\n";
        print backslashoctal($line) . "\n" if $opt->{octal_utf8_flag};    # Metaquoted utf8
    }
    return;
}

# Entry point: parse @ARGV, then show help or transliterate standard input.
# Returns the process exit status.
sub main {
    my %opt = default_options();
    $opt{helpme} = 1 unless @ARGV;    # print help if there are no switches
    parse_switches(\%opt, \@ARGV);

    if ($opt{helpme}) {
        show_help();
        return 0;
    }

    my $language = lc $opt{language};
    if ($language eq "arabic") {
        run_arabic(\%opt);
    }
    else {    # use %alphabet hash for all other languages
        run_alphabet(\%opt, $language);
    }
    return 0;
}

# Run only when executed as a script, so the subs can be loaded for testing.
exit(main()) unless caller;

1;