#!/usr/bin/perl
#΄Α»§³‘Ό₯Θ€ΟEUC
# Input file read by the first pass, and the file both passes write to.
$in  = "dt";
$out = "da";

# Regex alternation of abbreviations whose trailing period does NOT end a
# sentence.  It is interpolated into a substitution further down that re-joins
# lines split after one of these.  The '[A-Z]' entry is a character class, so
# it matches any single capital letter.
$dont = join('|',
	'M(r|s|rs)', 'Dr', 'U\.S(\.A|)', '[aApP]\.[mM]', 'Calif', 'D\.C', 'N\.Y',
	'V[Aa]', '[MS][Tt]',
	'Jan', 'Feb', 'Mar', 'Apr', 'Aug', 'Sep(t|)', 'Oct', 'Nov', 'Dec',
	'Assoc', '[oO]\.[kK]', 'Co', 'R\.V', 'Gov', 'Se(n|c)', 'Ont', 'U\.N',
	'[A-Z]', 'i\.e', 'e\.g', 'v(s|)', 'Pa', 'Fla', 'Re(p|v)', 'Gen', 'Univ',
	'Jr', '[fF]t', '[Ss]gt', '[Pp]res', '[Pp]rof', 'esp',
);

# Pass 1: read the raw input file $in into @lst, one element per line, gluing
# every indented continuation line onto the line before it.
@lst=();

# Three-argument open on a lexical handle; die (as the later opens in this
# script do) instead of silently producing empty output when $in is unreadable.
open (my $raw, '<', $in) or die "Can't open $in\n$!\n";
while (my $line = <$raw>) {
	# A line consisting only of dashes becomes an empty line.
	$line =~ s/^-+\n/\n/;
	# Every tab is replaced by a single space.
	$line =~ s/\t/ /g;
	# Insert a space after a run of two or more dots that is directly
	# followed by a non-space character.
	$line =~ s/(\.{2,100})([^ ])/$1 $2/g;
	if ($line =~ /^ {2,300}/) {
		# Two or more leading spaces mark a continuation: shrink the indent to
		# one space and drop the newline of the previous element so both end up
		# on the same output line.  There is nothing to join when this is the
		# very first line.
		$line =~ s/^ +/ /;
		$lst[-1] =~ s/\n// if @lst;
	}
	# Counting of non-indented, non-blank lines ($strange++) is disabled, so
	# this loop never sets $strange.
	push(@lst, $line);
}
close $raw;

# Avoid a double run: abort when more than 30 "strange" lines were counted.
# NOTE(review): the only increment of $strange visible in this file is
# commented out, so this check currently never fires.
if ($strange > 30) {
	# Report the file that was actually read ($in); the previous message
	# interpolated $t, which is never assigned anywhere in this script.
	print "something strange in File $in\n";
	exit (1);
}

# Write the joined lines from pass 1 to $out.  Elements are printed as they
# are; no separator or newline is added here.
# Three-argument open on a lexical handle, with open and close both checked so
# a write failure is reported instead of silently leaving a truncated file.
open (my $joined, '>', $out) or die "Can't open $out\n$!\n";
foreach my $entry (@lst) {
	print $joined $entry;
}
close $joined or die "Can't close $out\n$!\n";

# Pass 2: re-read $out and, for lines inside a <text> ... </text> region only,
# normalise punctuation and break the text into one sentence per line.  Lines
# outside such a region are kept as they are.  Results are collected in @lst.
# The substitutions below depend on their order; do not reorder them.
@lst=();

open (FILE, $out) || die "Can't open $out\n$!\n";
# $on is 1 while the current line is inside a <text> region, 0 otherwise.
$on=0;
while (<FILE>) {
	# A line containing <text> switches processing on; a line containing
	# </text> (and no <text>) switches it off again.
	if ($_ =~ /<text>/) {$on=1;} elsif ($_ =~ /<\/text>/) {$on=0;}
	chomp;
	# Outside a <text> region: store the line unchanged (minus its newline).
	if ($on == 0) {push (@lst, $_); next;}
	# Put the newline back, except on the line that carries the <text> tag.
	$_=$_."\n" if $_ !~ /<text>/;
	# Map non-ASCII sequences to an ASCII space, double quote and apostrophe.
	# NOTE(review): presumably EUC full-width space/quote characters (the file
	# header mentions EUC); the patterns look mis-decoded in this copy --
	# verify against the original bytes before editing these lines.
	$_ =~ s/‘‘/ /g;
	$_ =~ s/‘Ι|‘Θ/\"/g;
	$_ =~ s/‘Η|‘Ζ/'/g;
	$_ =~ s/ΌΧ/'/g;
	# Whitespace clean-up: strip leading whitespace (\s also matches the
	# newline, so a whitespace-only line becomes empty), collapse runs of
	# spaces, and drop trailing spaces.
	$_ =~ s/^\s+//;
	$_ =~ s/ +/ /g;
	$_ =~ s/ +$//;
	# A colon at the end of the line becomes a semicolon ...
	$_ =~ s/:$/;/;
	# ... and a line ending in "ET (...)" gets a colon appended.
	$_ =~ s/(ET \(.+\))$/$1:/;
	# A question mark with a space on both sides becomes " -- ".
	# NOTE(review): presumably a dash that was turned into "?" by an earlier
	# conversion -- confirm.
	$_ =~ s/ \? / -- /g;
	# Sentence splitting: replace the space after ?, !, ." / !", .) and .
	# with a newline.
	$_ =~ s/\? /\?\n/g;
	$_ =~ s/! /!\n/g;
	$_ =~ s/(\.|!)\" /$1\"\n/g;
	$_ =~ s/\.\) /\.\)\n/g;
	$_ =~ s/\. /\.\n/g;
	# Undo the split when the period belongs to an abbreviation listed in
	# $dont, or to "No." followed by a digit.
	$_ =~ s/\b($dont)\.\n/$1\. /g;
	$_ =~ s/\b([Nn][Oo])\.\n(\d)/$1\. $2/g; #No. 1
	# Capitalise a stand-alone "i" in the middle of a line, at the start of
	# the string, and right after a newline.
	$_ =~ s/ i / I /g;
	$_ =~ s/^i /I /;
	$_ =~ s/\ni /\nI /g;
	# Contraction repairs.  Note that "dont" is replaced wherever it occurs,
	# including inside longer words (there is no word boundary).
	$_ =~ s/dont/don't/g;
	$_ =~ s/n;t\b/n't/g;
	$_ =~ s/\b(i')(m|ll|ve|d)\b/I'$2/g;
	# Replace non-ASCII sequences with an apostrophe plus an ASCII letter.
	# NOTE(review): presumably EUC full-width letters; verify the bytes.
	# NOTE(review): as shown in this copy the pattern mapped to 'r below is
	# identical to the one mapped to 'v, so the 'r substitution can never
	# match.  The two may have been distinct before a lossy re-encoding --
	# check the original file.
	$_ =~ s/ΓΥ/'t/g;
	$_ =~ s/ΓΕ/'d/g;
	$_ =~ s/ΓΤ/'s/g;
	$_ =~ s/ΓΣ/'v/g;
	$_ =~ s/ΓΝ/'l/g;
	$_ =~ s/ΓΞ/'m/g;
	$_ =~ s/ΓΣ/'r/g;
	# Replace non-ASCII sequences with ASCII parentheses.
	# NOTE(review): presumably EUC full-width parentheses -- verify.
	$_ =~ s/‘Κ/\(/g;
	$_ =~ s/‘Λ/\)/g;
	push (@lst, $_);
}
close FILE;


# Write the processed lines back to $out.  Every element gets a newline
# appended, so an element that already ends in "\n" is followed by an empty
# line in the output.
# Three-argument open on a lexical handle.  The failure message now names the
# file and the system error; it previously interpolated $t, which is never
# assigned anywhere in this script.
open (my $final, '>', $out) or die "Can't open $out\n$!\n";
foreach my $entry (@lst) {
	print $final $entry, "\n";
}
# Buffered write errors only surface at close, so check it as well.
close $final or die "Can't close $out\n$!\n";
