#!/usr/bin/perl -w

# Text2IPA_Polish.pl Version 0.9

# this assumes a UTF-8 file in one word per line format and 
# automatically transcribes Polish words in IPA
# usage:
# Text2IPA_Polish.pl file
# See help (-h) for options

use Getopt::Std;
use utf8;
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

my $usage;
{
$usage = <<"_USAGE_";
This script attempts to generate an automatic phonetic transcription of Polish orthographic words 
using the International Phonetic Alphabet (IPA).

Notes and assumptions:
- The input text file is encoded in UTF-8 (without BOM)
- Each line of the input file contains exactly one word
- This script does NOT use a lexicon, meaning it will inevitably make 
  MANY mistakes - use it at your own risk! :)

Usage:  Text2IPA_Polish [options] <FILE>

Options and argument:

-h              print this message and quit
-s              Use this to [s]eparate syllables with '.'
-S              [S]tress: mark stressed syllables with a single quotation mark
-n              Keep front [n]asal vowels in word final position

<FILE>    A text file encoded in UTF-8 without BOM, one word per line


Examples:

Read a file and output IPA to standard output:
  Text2IPA_Polish.pl in_utf8.txt

Read a file and output IPA to a file:
  Text2IPA_Polish.pl in_utf8.txt > out_utf8.txt

Keep final front [n]asals: 
  Text2IPA_Polish.pl -n in_utf8.txt > out_utf8.txt
  
Use [S]tress marking and syllable [s]eparators: 
  Text2IPA_Polish.pl -s -S in_utf8.txt > out_utf8.txt
  Text2IPA_Polish.pl -sS in_utf8.txt > out_utf8.txt

Copyright 2011, Amir Zeldes

This program is free software. You may copy or redistribute it under
the same terms as Perl itself.
_USAGE_
}

### OPTIONS BEGIN ###
%opts = ();
getopts('hnSs',\%opts) or die $usage;

#help
if ($opts{h} || (@ARGV == 0)) {
    print $usage;
    exit;
}

#stress marking
$stress = 0;
if ($opts{S}) {$stress = 1;}

#syllable separator
if ($opts{s}) {$syl_separator = ".";}
else {$syl_separator="";}

#Final front nasals
$final_nasal = 0;
if ($opts{n}) {$final_nasal = 1;}


### OPTIONS END ###

open FILE,"<:encoding(UTF-8)",shift or die "could not find input document";
while (<FILE>) {

    chomp;
    $line = $_;

	$line = lc($line);
	
#Nasal vowel before labials (Nasal vowels are realized as a plain vowel plus [m] before labials)
	$line =~ s/ą([bp])/om$1/g;
	$line =~ s/ę([bp])/em$1/g;


#Nasal vowel before dentals (Nasal vowels are realized as a plain vowel plus [n] before dentals)
	$line =~ s/ą([tdc]|cz)/on$1/g;
	$line =~ s/ę([tdc]|cz)/en$1/g;


#Nasal vowel before palato-dentals (Nasal vowels are realized as a plain vowel plus [ń] before palato-dentals)
	$line =~ s/ą(dź|ci|dzi|ć)/oń$1/g;
	$line =~ s/ę(dź|ci|dzi|ć)/eń$1/g;


#Nasal vowel before velars (Nasal vowels are realized as a plain vowel plus [ŋ] before velars)
	$line =~ s/ą(k|g|ch)/oŋ$1/g;
	$line =~ s/ę(k|g|ch)/eŋ$1/g;


#Find syllables (Syllables are recognized using the onset maximization strategy - the longest possible onset is preferred moving from each vowel to the left. Use a hyphen to manually input borders.)
	$line =~ s/-/#/g;
	$line =~ s/([aeiouyóąę])/$1#/g;
	$line =~ s/i#([aeouóąę]#)/i$1/g;
	$line =~ s/(o)#i#/$1i#/g;
	$line =~ s/([aeiouyóąę]+)#([^aeiouyóąę]+)$/$1$2#/g;
	$line =~ s/([aeiouyóąę]+)#([lmnŋrjłń])([^aeiouyóąę]+)/$1$2#$3/g;
	$line =~ s/([aeiouyóąę]+)#(dź)(c)([aeiouyóąę]+)/$1$2#$3$4/g;
	$line =~ s/([aeiouyóąę]+)#([bpm])([bpm])([aeiouyóąę]+)/$1$2#$3$4/g;
	$line =~ s/([aeiouyóąę]+)#([nŋ])([kn])([aeiouyóąę]+)/$1$2#$3$4/g;
	$line =~ s/([ąę])#([^aeiouyóąę])([^aeiouyóąę])/$1$2#$3/g;
	$line =~ s/(^.*$)/#$1#/g;
	$line =~ s/##/#/g;



#Glottal stop (No initial syllable begins with a vowel - the vowel is preceded by a glottal stop by default)
	$line =~ s/^#([aeiouó])/ʔ$1/g;


#Identify palatalization ('i' signals palatalization of the preceding consonant)
	$line =~ s/bi([aeouóęą])/B$1/g;
	$line =~ s/pi([aeouóęą])/P$1/g;
	$line =~ s/wi([aeouóęą])/W$1/g;
	$line =~ s/mi([aeouóęą])/ɱ$1/g;
	$line =~ s/ni([aeouóęą])/ń$1/g;
	$line =~ s/ni/ńi/g;
	$line =~ s/ci([aeouóęą])/ć$1/g;
	$line =~ s/ci/ći/g;
	$line =~ s/zi([aeouóęą])/ź$1/g;
	$line =~ s/zi/źi/g;	
	$line =~ s/si([aeouóęą])/ś$1/g;
	$line =~ s/si/śi/g;


#Progressive devoicing ('w' and 'rz' are devoiced after voiceless consonants)
	$line =~ s/([ptkćsfś]|ch|cz|sz)rz/$1sz/g;
	$line =~ s/([ptkćsfś]|ch|cz|sz)w/$1f/g;


#Regressive voicing and devoicing (Consonants match the voicing of the following consonant)
	$line =~ s/d(#?[ptkćsfcś])/t$1/g;
	$line =~ s/b(#?[ptkćsfcś])/p$1/g;
	$line =~ s/g(#?[ptkćsfcś])/k$1/g;
	$line =~ s/w(#?[ptkćsfcś])/f$1/g;
	$line =~ s/w(#?[ptkćsfcś])/f$1/g;
	$line =~ s/dż(#?[ptkćsfcś])/cz$1/g;
	$line =~ s/dz(#?[ptkćsfcś])/c$1/g;
	$line =~ s/dź(#?[ptkćsfcś])/ć$1/g;
	$line =~ s/(rz|ż)(#?[ptkćsfcś])/sz$2/g;
	$line =~ s/ź(#?[ptkćsfcś])/ś$1/g;
	$line =~ s/z(#?[ptkćsfcś])/s$1/g;
	$line =~ s/p(#?([bdgwżź]|rz))/b$1/g;
	$line =~ s/t(#?([bdgwżź]|rz))/d$1/g;
	$line =~ s/k(#?([bdgwżź]|rz))/g$1/g;
	$line =~ s/s(#?([bdgwżź]|rz))/z$1/g;
	$line =~ s/ś(#?([bdgwżź]|rz))/ź$1/g;
	$line =~ s/c(#?([bdgwżź]|rz))/dz$1/g;
	$line =~ s/ć(#?([bdgwżź]|rz))/dź$1/g;
	$line =~ s/cz(#?([bdgwżź]|rz))/dż$1/g;
	$line =~ s/sz(#?([bdgwżź]|rz))/rz$1/g;


#Nasalized glide (Before spirants and affricates ń is preceded by j)
	$line =~ s/ń(#?(s|ś|sz|c|ć|cz|dz|dż|dź))/jń$1/g;


#Identify consonants (Replace various consonants and digraphs with IPA symbols)
	$line =~ s/w/v/g;
	$line =~ s/ch/x/g;
	$line =~ s/cz/ʧ/g;
	$line =~ s/sz/ʃ/g;
	$line =~ s/dz/ʣ/g;
	$line =~ s/(rz|ż)/ʒ/g;
	$line =~ s/dż/ʤ/g;
	$line =~ s/dź/ʥ/g;
	$line =~ s/ź/ʑ/g;
	$line =~ s/ś/ɕ/g;
	$line =~ s/ć/ʨ/g;
	$line =~ s/c/ʦ/g;
	$line =~ s/ń/ɲ/g;
	$line =~ s/ł/w/g;


#Final voice loss (Voiced obstruents become devoiced in the coda of the final syllable and before the ending -my)
	$line =~ s/b([rw]?)#((my#)?)$/p$1#$2/g;
	$line =~ s/d([rw]?)#((my#)?)$/t$1#$2/g;
	$line =~ s/g([rw]?)#((my#)?)$/k$1#$2/g;
	$line =~ s/z([rw]?)#((my#)?)$/s$1#$2/g;
	$line =~ s/v([rw]?)#((my#)?)$/f$1#$2/g;
	$line =~ s/ʣ([rw]?)#((my#)?)$/ʦ$1#$2/g;
	$line =~ s/ʒ([rw]?)#((my#)?)$/ʃ$1#$2/g;
	$line =~ s/ʑ([rw]?)#((my#)?)$/ɕ$1#$2/g;
	$line =~ s/ʥ([rw]?)#((my#)?)$/ʨ$1#$2/g;
	$line =~ s/ʤ([rw]?)#((my#)?)$/ʧ$1#$2/g;


#Final w drop after consonant ('w' is omitted after a final consonant)
    $line =~ s/([^aeiouyóąę])w#$/$1#/g;


#Denazalize final ę ('ę' is pronounces like 'e' in final position)
if ($final_nasal == 0)
{
    $line =~ s/ę#$/e#/g;
}

#Identify vowels (Replace vowel characters with corresponding IPA symbols)
    $line =~ s/ó/u/g;
    $line =~ s/y/ɨ/g;
    $line =~ s/e/ɛ/g;
    $line =~ s/ę/ɛ̃/g;
    $line =~ s/ą/ɔ̃/g;

#Insert stress if desired, assume paenultimate stress except for aparent 1-2 person plural past tense forms
if ($stress == 1)
{
$line =~ s/^#([^#]+)#$/#'$1#/g; #monosyllables
$line =~ s/#([^#]+)#([^#]+)#$/#'$1#$2#/g; #polysyllables
$line =~ s/#([^#]+)#'([^#]*[lw][iɨ])#(ɕʨɛ|ɕmɨ)#$/#'$1#$2#$3#/g; #fix 1-2 person plural past tense stress
}


#remove initial and final border
$line =~ s/(^#|#$)//g;


#choose syllable marker
$line =~ s/#/$syl_separator/g;


print $line . "\n";

}

