#!/usr/bin/perl -w
# -*- coding: iso-8859-1 -*-
# $Id: ispellaff2myspell,v 1.29 2005/07/04 12:21:55 agmartin Exp $
#
# (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# usage()
#
# Print the command-line help and terminate the program.
#
# Both call sites (failed option parsing, missing affix file) are error
# paths, so the text is written to STDERR -- STDOUT carries the generated
# affix data and is normally redirected to a file -- and the exit status
# is non-zero.
#
# Takes no arguments; does not return.
sub usage {
    print STDERR <<'END_USAGE';
ispellaff2myspell: A program to convert ispell affix tables to myspell format
(C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es> License: GPL

Usage:
    ispellaff2myspell [options] <affixfile>

Options:
    --affixfile=s       Affix file
    --bylocale          Use current locale setup for upper/lowercase
                        conversion
    --charset=s         Use specified charset for upper/lowercase
                        conversion (defaults to latin1)
    --debug             Print debugging info
    --extraflags[=s]    Allow some non alphabetic flags
    --lowercase=s       Lowercase string
    --myheader=s        Header file
    --printcomments     Print commented lines in output
    --replacements=s    Replacements file
    --split=i           Split flags with more than i entries
    --uppercase=s       Uppercase string
    --wordlist=s        Still unused

Currently allowed values for charset are: latin0, latin1, latin2, latin3

This script does not create the dict file. Something like
    ( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
should do the work, with mydict.words+ being the ispell munched wordlist
END_USAGE
    exit 1;
}
# debugprint(LIST)
#
# Write LIST to STDERR, interpolated as "@_" (elements separated by the
# list separator, a single space by default), but only while the global
# $debug flag is true.  Nothing is printed otherwise.
sub debugprint {
    print STDERR "@_" if $debug;
}
# shipoutflag()
#
# Print the affix class collected so far and reset the collection state.
#
# Reads the globals $myaffix, $flagname, $flagcombine, @flag_array and
# $split.  For a non-empty @flag_array it prints a header line
# "$myaffix $flagname $flagcombine <count>", the queued rules one per
# line, and an empty line.  With a positive $split the rules are emitted
# in chunks of at most $split entries, each with its own header; every
# header except the last one gets a trailing " S".
#
# A zero, empty or negative $split means "do not split".  (A negative
# value used to make splice() stop removing elements, so the loop never
# terminated.)
#
# Always clears @flag_array, $flagname and $flagcombine before returning.
sub shipoutflag {
    if ( @flag_array ) {
        my $chunk_size =
            ( $split && $split > 0 ) ? $split : scalar @flag_array;

        while ( @flag_array ) {
            my @chunk  = splice( @flag_array, 0, $chunk_size );
            my $header = "$myaffix $flagname $flagcombine " . scalar @chunk;

            # Rules still queued after this chunk: mark the header.
            $header .= " S" if @flag_array;

            print "$header\n";
            print join( "\n", @chunk );
            print "\n\n";
        }
    }

    @flag_array  = ();
    $flagname    = '';
    $flagcombine = '';
}
# mylc(STRING)
#
# Return a lowercased copy of STRING.
#
# With $bylocale set, Perl's lc() is used under "use locale".  Otherwise
# the conversion is a tr/// from $uppercase to $lowercase: for the known
# charsets (latin0, latin1, latin2, latin3) these two globals are (re)set
# here from the built-in tables; for any other value of $charset the
# strings already stored in $lowercase/$uppercase are used unchanged.
#
# The built-in tables are written with \xNN byte escapes so that they do
# not depend on the encoding this source file happens to be stored in.
#
# Dies when $charset is unknown and no tables were supplied, or when the
# two tables do not form a valid tr/// specification.
sub mylc {
    my $inputstring = shift;

    if ( $bylocale ) {
        use locale;    # lexically scoped: only affects this block
        return lc $inputstring;
    }

    # charset => [ lowercase list, uppercase list ], position by position.
    # NOTE(review): the latin3 ranges keep the original pairing of 0xD7
    # with 0xF7, which look like the multiplication/division signs in
    # ISO-8859-3 rather than a case pair -- confirm before changing.
    my %builtin_tables = (
        latin0 => [
            'a-z\xE0-\xF6\xF8-\xFE\xBD\xA8\xB8',
            'A-Z\xC0-\xD6\xD8-\xDE\xBC\xA6\xB4',
        ],
        latin1 => [
            'a-z\xE0-\xF6\xF8-\xFE',
            'A-Z\xC0-\xD6\xD8-\xDE',
        ],
        latin2 => [
            'a-z\xB1\xB3\xB5\xB6\xB9-\xBC\xBE\xBF\xE0-\xF6\xF8-\xFE',
            'A-Z\xA1\xA3\xA5\xA6\xA9-\xAC\xAE\xAF\xC0-\xD6\xD8-\xDE',
        ],
        latin3 => [
            'a-z\xB1\xB6\xB9-\xBC\xBF\xE0-\xE2\xE4-\xEF\xF1-\xFE',
            'A-Z\xA1\xA6\xA9-\xAC\xAF\xC0-\xC2\xC4-\xCF\xD1-\xDE',
        ],
    );

    if ( my $tables = $builtin_tables{$charset} ) {
        ( $lowercase, $uppercase ) = @{$tables};
    }
    elsif ( not $lowercase and not $uppercase ) {
        die "Unsupported charset [$charset]
use explicitely --lowercase=string and --uppercase=string
options. Remember that both string must match exactly, but
case changed.
";
    }

    # tr/// does not interpolate variables, hence the string eval.  The
    # resulting closure is cached per table pair so the eval runs once
    # instead of once per converted line.
    # NOTE(review): the table strings end up in evaluated code; that is
    # acceptable for strings typed by the invoking user, but untrusted
    # input must never reach $lowercase/$uppercase.
    our %mylc_translator_for;
    my $table_key  = "$uppercase\0$lowercase";
    my $translator = $mylc_translator_for{$table_key};
    if ( not $translator ) {
        $translator = eval
            "sub { my \$text = shift; \$text =~ tr/$uppercase/$lowercase/; return \$text; }";
        die "Error: invalid case conversion tables [$uppercase] -> [$lowercase]: $@\n"
            if not $translator;
        $mylc_translator_for{$table_key} = $translator;
    }

    return $translator->($inputstring);
}
# validate_flag(FLAG)
#
# Decide whether FLAG can be used as a flag name in the output.
#
# Returns FLAG itself when it consists only of ASCII letters.  Otherwise,
# when $hasextraflags is set, each key of %theextraflags is tried as a
# regular expression anchored at the start of FLAG; on the first match
# the matched prefix is removed and the remainder is returned.  Returns
# the empty string (false) when the flag is not acceptable.
#
# The sub is declared without a prototype: it takes one argument, and the
# former empty prototype only worked because callers used the &-form.
sub validate_flag {
    my $flag = shift;

    # The whole flag must be alphabetic, not merely contain a letter.
    return $flag if $flag =~ m/\A[a-zA-Z]+\z/;
    return '' unless $hasextraflags;

    # Longest prefix first (ties broken alphabetically) so the result does
    # not depend on hash ordering when several prefixes match.
    my @prefixes =
        sort { length($b) <=> length($a) or $a cmp $b } keys %theextraflags;

    foreach my $prefix (@prefixes) {
        # The prefixes are regular expressions on purpose ("^" accepts
        # every flag unmodified), so they are not quoted here.
        return $flag if $flag =~ s/^$prefix//;
    }

    return '';
}
# process_replacements(FILE)
#
# Print a replacement table taken from FILE.
#
# Only lines starting with "REP" are kept; lines of the form "REP <number>"
# (an existing count header) are skipped.  Line endings (LF, CRLF or a
# trailing CR) are removed.  The output is a fresh "REP <count>" header
# followed by the kept lines, one per line.
#
# Dies when FILE cannot be opened.
sub process_replacements {
    my $file = shift;
    my @replaces;

    # Three-argument open: the file name is taken literally, whatever
    # characters or surrounding blanks it contains.
    open( my $replace_fh, '<', $file )
        or die "Error: Could not open replacements file: $file: $!\n";

    while ( my $line = <$replace_fh> ) {
        next unless $line =~ m/^REP\s*\D/;
        next if $line =~ m/^REP\s+[0-9]+/;
        $line =~ s/\015?\012?\z//;
        push @replaces, $line;
    }
    close $replace_fh;

    print "REP " . scalar(@replaces) . "\n";
    foreach my $replacement (@replaces) {
        print $replacement . "\n";
    }
}
# -----------------------------------------------------------
# Now the program starts, after the functions are defined
# -----------------------------------------------------------
use Getopt::Long;

# Option values: everything starts out as the empty string (false) and is
# filled in from the command line afterwards.
$affixfile     = $bylocale  = $charset      = '';
$debug         = $lowercase = $myheader     = '';
$printcomments = $split     = $replacements = '';
$uppercase     = $wordlist  = $hasextraflags = '';

# Rules queued for the flag being collected, and the extra flag prefixes.
@flag_array    = ();
%theextraflags = ();

# State of the rule being parsed; "0" stands for "nothing to strip".
$rootremove = "0";
$rootname   = $addtoroot = $comment = '';

# State of the flag being collected.
$flagname = $flagcombine = $inflags = '';
# Parse the command line.  --extraflags takes an optional value: every
# use switches $hasextraflags on, and a non-empty value is recorded as a
# flag prefix in %theextraflags.  Any parsing error ends in usage().
GetOptions(
    'affixfile=s'    => \$affixfile,
    'bylocale'       => \$bylocale,
    'charset=s'      => \$charset,
    'debug'          => \$debug,
    'extraflags:s'   => sub {
        my ( $option_name, $prefix ) = @_;
        $hasextraflags = 1;
        $theextraflags{$prefix}++ if $prefix;
    },
    'lowercase=s'    => \$lowercase,
    'myheader=s'     => \$myheader,
    'printcomments'  => \$printcomments,
    'replacements=s' => \$replacements,
    'split=i'        => \$split,
    'uppercase=s'    => \$uppercase,
    'wordlist=s'     => \$wordlist,
) or usage();

# Without --affixfile the first remaining argument is the affix file.
if ( not $affixfile ) {
    $affixfile = shift or usage();
}

# Case conversion is configured either through --charset or through an
# explicit --lowercase/--uppercase pair, never both; latin1 is the default.
if ( $charset and ( $lowercase or $uppercase ) ) {
    die "Error: charset and lowercase/uppercase options
are incompatible. Use either charset or lowercase/uppercase options to
specify the patterns
";
}
elsif ( not $charset ) {
    if ( not $lowercase and not $uppercase ) {
        $charset = "latin1";
    }
    elsif ( not $lowercase or not $uppercase ) {
        # A single table turns the tr/// based conversion into a silent
        # no-op, so insist on the pair.
        die "Error: --lowercase and --uppercase must be given together\n";
    }
}

# --extraflags without any value: accept flags written with a leading
# backslash (the key is a regular expression matching one backslash).
if ( scalar( keys %theextraflags ) == 0 && $hasextraflags ) {
    $theextraflags{"\\\\"}++;
}
debugprint("$affixfile $charset\n");

# Three-argument open: the file name is used literally.  The bareword
# handle is kept because the rest of the script reads from AFFIXFILE.
open( AFFIXFILE, '<', $affixfile )
    or die "Error: Could not open affix file: $affixfile: $!\n";

# Copy the hand-written myspell header, if any, in front of the affix
# table.  The file is read directly rather than through a shell "cat", so
# odd characters in its name are harmless and a missing file is fatal,
# like the other input files.
if ( $myheader ) {
    open( my $header_fh, '<', $myheader )
        or die "Error: Could not open header file: $myheader: $!\n";
    my $myspell_header = do { local $/; <$header_fh> };
    close $header_fh;
    $myspell_header = '' unless defined $myspell_header;
    print $myspell_header . "\n";
}
# Main conversion loop: read the ispell affix file line by line and queue
# myspell rules per flag.
#
#   * comment lines are dropped (or copied through with --printcomments)
#     and blank lines are dropped;
#   * "prefixes" / "suffixes" select PFX or SFX for the flags that follow;
#   * a "flag" line ships the flag collected so far and starts a new one;
#   * any other line inside a flag is read as a rule
#     "condition > [-strip,]append" and queued in @flag_array.
#
# Lines before the first prefixes/suffixes keyword, and lines before the
# first flag of a section, are ignored.
while ( <AFFIXFILE> ) {
    chomp;
    if ( /^\s*\#/ ) {
        debugprint("Ignoring line $.\n");
        print "$_\n" if $printcomments;
    }
    elsif ( /^\s*$/ ) {
        debugprint("Ignoring line $.\n");
    }
    elsif ( /^\s*prefixes/ ) {
        debugprint("Prefixes starting in line $.\n");
        $affix = "PFX";
    }
    elsif ( /^\s*suffixes/ ) {
        debugprint("Suffixes starting in line $.\n");
        $affix = "SFX";
    }
    elsif ( /^\s*flag/ ) {
        next if not $affix;    # In case we are still in the preamble
        shipoutflag() if $inflags;
        $inflags = "yes";

        # Reduce the line to the bare flag name.
        s/^\s*flag\s*//;
        s/\s*:.*$//;
        debugprint("Found flag $_ in line $.\n");

        # A "*" marks a flag that may be combined with other affixes.
        if ( /\*/ ) {
            s/[\*\s]//g;
            $flagcombine = "Y";
            debugprint("Flag renamed to $_ with combine=$flagcombine\n");
        }
        else {
            $flagcombine = "N";
        }

        # The &-form keeps this call independent of any prototype
        # declared on validate_flag.
        if ( $flagname = &validate_flag($_) ) {
            $myaffix = $affix;
        }
        else {
            # Rejected flags are still written, but commented out.
            $myaffix  = "\# $affix";
            $flagname = $_;
            print STDERR "Ignoring invalid flag $flagname in line $.\n";
        }
    }
    elsif ( $affix and $inflags ) {
        # Split off a trailing comment, then normalise the rule itself:
        # no blanks, all lowercase.
        ( $rootname, @comments ) = split( '#', $_ );
        $comment = '# ' . join( '#', @comments );

        $rootname =~ s/\s*//g;
        $rootname = mylc($rootname);
        ( $rootname, $addtoroot ) = split( '>', $rootname );

        if ( defined $rootname and defined $addtoroot ) {
            if ( $addtoroot =~ s/^\-// ) {
                # "-strip,append": separate what is removed from the root
                # from what is appended to it.
                ( $rootremove, $addtoroot ) = split( ',', $addtoroot );
                $rootremove = "0"
                    unless defined $rootremove and length $rootremove;
                $addtoroot = "0" unless $addtoroot;
                $addtoroot = "0" if ( $addtoroot eq "-" );
            }
            else {
                $rootremove = "0";
            }
            $addtoroot =~ s/\\\-/\-/g;    # prefix ANTI\- to anti-

            # An unconditional rule that strips something is conditioned
            # on the stripped text.
            if ( $rootname eq '.' && $rootremove ne "0" ) {
                $rootname = $rootremove;
            }

            debugprint("$rootname, $addtoroot, $rootremove\n");
            if ( $printcomments ) {
                $affix_line = sprintf( "%s %s %-5s %-11s %-24s %s",
                    $myaffix, $flagname, $rootremove,
                    $addtoroot, $rootname, $comment );
            }
            else {
                $affix_line = sprintf( "%s %s %-5s %-11s %s",
                    $myaffix, $flagname, $rootremove,
                    $addtoroot, $rootname );
            }
            push @flag_array, $affix_line;
            debugprint("$affix_line\n");
        }
        else {
            # No "condition > change" pair on this line: queueing it would
            # put a broken entry into the table and into the entry count.
            print STDERR "Ignoring malformed rule in line $.\n";
        }

        # Reset the per-rule state for the next line.
        $rootremove = "0";
        $rootname   = '';
        $addtoroot  = '';
        $comment    = '';
        @comments   = ();
    }
}
# End of input: flush the flag still being collected and release the
# affix file.
shipoutflag();
close(AFFIXFILE);

# Append the replacement table when one was requested.
process_replacements($replacements) if $replacements;

__END__

=head1 NAME

B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.

=head1 SYNOPSIS

  ispellaff2myspell [options] <affixfile> --myheader your_header

  Options:

   --affixfile=s       Affix file
   --bylocale          Use current locale setup for upper/lowercase
                       conversion
   --charset=s         Use specified charset for upper/lowercase
                       conversion (defaults to latin1)
   --debug             Print debugging info
   --extraflags[=s]    Allow some non alphabetic flags
   --lowercase=s       Lowercase string
   --myheader=s        Header file
   --printcomments     Print commented lines in output
   --replacements=s    Replacements file
   --split=i           Split flags with more than i entries
   --uppercase=s       Uppercase string

=head1 DESCRIPTION

B<ispellaff2myspell> is a script that will convert ispell affix tables
to myspell format in a more or less successful way.

This script does not create the dict file. Something like

  ( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the munched wordlist.

=head1 OPTIONS

=over 8

=item B<--affixfile=s>

Affix file. You can put it directly in the command line.

=item B<--bylocale>

Use current locale setup for upper/lowercase conversion. Make sure
that the selected locale matches the dictionary one, or you might get
into trouble.

=item B<--charset=s>

Use specified charset for upper/lowercase conversion (defaults to latin1).
Currently allowed values for charset are: latin0, latin1, latin2, latin3.

=item B<--debug>

Print some debugging info.

=item B<--extraflags:s>

Allows some non alphabetic flags.

When invoked with no value the supported flags are currently those
corresponding to chars represented with the escape char B<\> as
first char. B<\> will be stripped.

When given a flag prefix, flags starting with that prefix are allowed and
the prefix is stripped. Be careful when giving the prefix to properly escape chars,
e.g. you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to
B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all
flags and pass them unmodified.

You will need a call to -e for each flag type, e.g.,
B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).

When a prefix is explicitly set, the default value (anything starting by B<\>)
is disabled and you need to enable it explicitly as in the previous example.

=item B<--lowercase=s>

Lowercase string. Manually set the string of lowercase chars. This
requires B<--uppercase> having exactly that string but uppercase.

=item B<--myheader=s>

Header file. The myspell aff header. You need to write it
manually. This can contain everything you want to be before the affix table.

=item B<--printcomments>

Print commented lines in output.

=item B<--replacements=file>

Add a pre-defined replacements table taken from 'file' to the .aff file.
Will skip lines not beginning with REP, and set the replacements number
appropriately.

=item B<--split=i>

Split flags with more than i entries. This can be of interest for flags
having a lot of entries. Will split the flag in chunks containing B<i>
entries.

=item B<--uppercase=s>

Uppercase string. Manually set the string of uppercase chars. This
requires B<--lowercase> having exactly that string but lowercase.

=back

If your encoding is currently unsupported you can send me a file with
the two strings of lower and uppercase chars. Note that they must match
exactly but case changed. It will look something like

  $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
  $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';

=head1 SEE ALSO

The OpenOffice.org Lingucomponent Project home page

L<http://lingucomponent.openoffice.org/index.html>

and the document

L<http://lingucomponent.openoffice.org/affix.readme>

that provides information about the basics of the myspell affix file format.

You can also take a look at

  /usr/share/doc/libmyspell-dev/affix.readme.gz
  /usr/share/doc/libmyspell-dev/README.compoundwords
  /usr/share/doc/libmyspell-dev/README.replacetable

in your Debian system.

=head1 AUTHORS

Agustin Martin <agustin.martin@hispalinux.es>

=cut