123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- use strict;
- use v5.10;
- AddModuleDescription('wordstem.pl', 'WordStemming');
- *OldStemmingResolveId = \&ResolveId;
- *ResolveId = \&NewStemmingResolveId;
- initialise();
- my %StemmedPages = ();
- sub NewStemmingResolveId {
- my $id = shift;
- my ($class, $resolved, $title, $exists) = OldStemmingResolveId($id);
- return ($class, $resolved, $title, $exists) if $resolved;
- if (not %StemmedPages) {
- foreach my $page (AllPagesList()) {
- $StemmedPages{&stemWord($page)} = $page;
- }
- }
- my $page = &stemWord($id);
- if ($StemmedPages{$page}) {
- return ('local stemmed', $StemmedPages{$page}, $StemmedPages{$page}, undef);
- }
- }
- my %step2list;
- my %step3list;
- my ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);
- sub stem
- { my ($stem, $suffix, $firstch);
- my $w = shift;
- if (length($w) < 3) { return $w; } # length at least 3
- # now map initial y to Y so that the patterns never treat it as vowel:
- $w =~ /^./; $firstch = $&;
- if ($firstch =~ /^y/) { $w = ucfirst $w; }
- # Step 1a
- if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
- elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
- # Step 1b
- if ($w =~ /eed$/) { if ($` =~ /$mgr0/) { chop($w); } }
- elsif ($w =~ /(ed|ing)$/)
- { $stem = $`;
- if ($stem =~ /$_v/)
- { $w = $stem;
- if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
- elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
- elsif ($w =~ /^${C}${v}[^aeiouwxy]$/) { $w .= "e"; }
- }
- }
- # Step 1c
- if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/) { $w = $stem."i"; } }
- # Step 2
- if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
- { $stem = $`; $suffix = $1;
- if ($stem =~ /$mgr0/) { $w = $stem . $step2list{$suffix}; }
- }
- # Step 3
- if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
- { $stem = $`; $suffix = $1;
- if ($stem =~ /$mgr0/) { $w = $stem . $step3list{$suffix}; }
- }
- # Step 4
- if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
- { $stem = $`; if ($stem =~ /$mgr1/) { $w = $stem; } }
- elsif ($w =~ /(s|t)(ion)$/)
- { $stem = $` . $1; if ($stem =~ /$mgr1/) { $w = $stem; } }
- # Step 5
- if ($w =~ /e$/)
- { $stem = $`;
- if ($stem =~ /$mgr1/ or
- ($stem =~ /$meq1/ and not $stem =~ /^${C}${v}[^aeiouwxy]$/))
- { $w = $stem; }
- }
- if ($w =~ /ll$/ and $w =~ /$mgr1/) { chop($w); }
- # and turn initial Y back to y
- if ($firstch =~ /^y/) { $w = lcfirst $w; }
- return $w;
- }
- sub initialise {
- %step2list =
- ( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble',
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate',
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al',
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');
- %step3list =
- ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');
- $c = "[^aeiou]"; # consonant
- $v = "[aeiouy]"; # vowel
- $C = "${c}[^aeiouy]*"; # consonant sequence
- $V = "${v}[aeiou]*"; # vowel sequence
- $mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0
- $meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1
- $mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1
- $_v = "^(${C})?${v}"; # vowel in stem
- }
- sub stemWord {
- my $page = shift;
- my $oldpage = $page;
- $page = "";
- # Split the word up at case changes and stem each subword
- my @words = split(/([a-z]*)([A-Z]+[a-z]+)/,$oldpage);
- foreach my $w(@words) {
- if ($w) {
- if ($w =~ /_/) { # Possible word separated by _
- my @subwords = split(/_/,$w);
- foreach my $w(@subwords) {
- if ($w) {
- $page .= lc(&stem($w)); #Force case changes to not matter
- }
- }
- }
- else{
- $page .= lc(&stem($w));
- }
- }
- }
- return $page;
- }
|