wordlist2hunspell 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. #!/bin/sh
  2. #
  3. # (C) 2008 Caolán McNamara <caolanm@redhat.com>
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. # This creates a LANG_TERRITORY .aff & .dic from a wordlist.
  # The output is only a simple wordlist spellchecking dictionary: no
  # knowledge of language rules is applied to shrink the wordlist or to
  # provide .aff rules for extending word stems.
  # Both positional arguments (wordlist file, then locale) are mandatory.
  # With fewer than two, show the usage text and stop with status 1.
  case $# in
    0 | 1)
      printf '%s\n' \
        'Usage: wordlist2hunspell wordlist_file locale' \
        'e.g. wordlist2hunspell breton.words br_FR to create br_FR.dic and br_FR.aff in cwd'
      exit 1
      ;;
  esac
  # wordlist2hunspell_try_chars WORDLIST
  #
  # Print, on one line without a trailing newline, every distinct character
  # found in WORDLIST, ordered from most to least frequent. Space characters
  # are never emitted. The result is meant to follow the TRY keyword of a
  # hunspell .aff file.
  wordlist2hunspell_try_chars() {
    # Pipeline: one character per line -> drop empty lines -> group equal
    # lines -> count them -> order by descending count -> strip the count.
    # see https://bugzilla.redhat.com/show_bug.cgi?id=462184 for the "C" hacks
    #
    # The count is stripped with an anchored sed expression rather than by
    # cutting a fixed field: uniq -c only left-pads the count, so once the
    # count fills the pad width there is no leading blank and a fixed field
    # number would silently drop that character.
    sed 's/./&\n/g' < "$1" \
      | sed '/^$/d' \
      | LC_ALL=C sort -n \
      | LC_ALL=C uniq -c \
      | LC_ALL=C sort -rn \
      | LC_ALL=C sed 's/^ *[0-9][0-9]* //' \
      | tr -d ' \n'
  }

  # wordlist2hunspell_generate WORDLIST LOCALE
  #
  # Create LOCALE.aff and LOCALE.dic in the current directory from WORDLIST.
  #   LOCALE.aff : a comment line, "SET UTF-8" and a frequency-ordered TRY line
  #   LOCALE.dic : the number of non-empty lines of WORDLIST, followed by those
  #                lines sorted bytewise
  # Exports LANG=LOCALE.utf8 so that the character splitting done by sed runs
  # in that locale. Prints a confirmation line on stdout.
  # Returns 0 on success; returns 1 (with a message on stderr) when WORDLIST is
  # not readable, LOCALE is empty, or an output file cannot be written.
  wordlist2hunspell_generate() {
    if [ ! -r "$1" ]; then
      printf '%s\n' "wordlist2hunspell: cannot read wordlist '$1'" >&2
      return 1
    fi
    if [ -z "$2" ]; then
      printf '%s\n' 'wordlist2hunspell: locale must not be empty' >&2
      return 1
    fi

    LANG="$2.utf8"
    export LANG

    # Capture the TRY characters first and expand them quoted, so characters
    # such as '*' or '?' are written literally instead of being glob-expanded.
    w2h_try=$(wordlist2hunspell_try_chars "$1")
    printf '%s\n' \
      '# A basic .aff for a raw wordlist, created through wordlist2hunspell' \
      'SET UTF-8' \
      "TRY${w2h_try:+ $w2h_try}" > "$2.aff" || return 1

    # grep -c counts a final line that lacks a trailing newline, so the count
    # always matches the number of words that sort writes below it.
    {
      LC_ALL=C grep -c -v '^$' < "$1"
      LC_ALL=C sort < "$1" | sed '/^$/d'
    } > "$2.dic" || return 1

    printf '%s\n' "Basic $2.dic and $2.aff created"
  }

  # Final statement of the script: its status becomes the script's exit status.
  wordlist2hunspell_generate "$1" "$2"