wordstem.pl 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. # This program is free software; you can redistribute it and/or modify
  2. # it under the terms of the GNU General Public License as published by
  3. # the Free Software Foundation; either version 3 of the License, or
  4. # (at your option) any later version.
  5. #
  6. # This program is distributed in the hope that it will be useful,
  7. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. # GNU General Public License for more details.
  10. #
  11. # You should have received a copy of the GNU General Public License
  12. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. use strict;
  14. use v5.10;
  15. AddModuleDescription('wordstem.pl', 'WordStemming');
  16. *OldStemmingResolveId = \&ResolveId;
  17. *ResolveId = \&NewStemmingResolveId;
  18. initialise();
  19. my %StemmedPages = ();
  20. sub NewStemmingResolveId {
  21. my $id = shift;
  22. my ($class, $resolved, $title, $exists) = OldStemmingResolveId($id);
  23. return ($class, $resolved, $title, $exists) if $resolved;
  24. if (not %StemmedPages) {
  25. foreach my $page (AllPagesList()) {
  26. $StemmedPages{&stemWord($page)} = $page;
  27. }
  28. }
  29. my $page = &stemWord($id);
  30. if ($StemmedPages{$page}) {
  31. return ('local stemmed', $StemmedPages{$page}, $StemmedPages{$page}, undef);
  32. }
  33. }
  34. my %step2list;
  35. my %step3list;
  36. my ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);
  37. sub stem
  38. { my ($stem, $suffix, $firstch);
  39. my $w = shift;
  40. if (length($w) < 3) { return $w; } # length at least 3
  41. # now map initial y to Y so that the patterns never treat it as vowel:
  42. $w =~ /^./; $firstch = $&;
  43. if ($firstch =~ /^y/) { $w = ucfirst $w; }
  44. # Step 1a
  45. if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
  46. elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
  47. # Step 1b
  48. if ($w =~ /eed$/) { if ($` =~ /$mgr0/) { chop($w); } }
  49. elsif ($w =~ /(ed|ing)$/)
  50. { $stem = $`;
  51. if ($stem =~ /$_v/)
  52. { $w = $stem;
  53. if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
  54. elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
  55. elsif ($w =~ /^${C}${v}[^aeiouwxy]$/) { $w .= "e"; }
  56. }
  57. }
  58. # Step 1c
  59. if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/) { $w = $stem."i"; } }
  60. # Step 2
  61. if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
  62. { $stem = $`; $suffix = $1;
  63. if ($stem =~ /$mgr0/) { $w = $stem . $step2list{$suffix}; }
  64. }
  65. # Step 3
  66. if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
  67. { $stem = $`; $suffix = $1;
  68. if ($stem =~ /$mgr0/) { $w = $stem . $step3list{$suffix}; }
  69. }
  70. # Step 4
  71. if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
  72. { $stem = $`; if ($stem =~ /$mgr1/) { $w = $stem; } }
  73. elsif ($w =~ /(s|t)(ion)$/)
  74. { $stem = $` . $1; if ($stem =~ /$mgr1/) { $w = $stem; } }
  75. # Step 5
  76. if ($w =~ /e$/)
  77. { $stem = $`;
  78. if ($stem =~ /$mgr1/ or
  79. ($stem =~ /$meq1/ and not $stem =~ /^${C}${v}[^aeiouwxy]$/))
  80. { $w = $stem; }
  81. }
  82. if ($w =~ /ll$/ and $w =~ /$mgr1/) { chop($w); }
  83. # and turn initial Y back to y
  84. if ($firstch =~ /^y/) { $w = lcfirst $w; }
  85. return $w;
  86. }
  87. sub initialise {
  88. %step2list =
  89. ( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble',
  90. 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate',
  91. 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al',
  92. 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');
  93. %step3list =
  94. ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');
  95. $c = "[^aeiou]"; # consonant
  96. $v = "[aeiouy]"; # vowel
  97. $C = "${c}[^aeiouy]*"; # consonant sequence
  98. $V = "${v}[aeiou]*"; # vowel sequence
  99. $mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0
  100. $meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1
  101. $mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1
  102. $_v = "^(${C})?${v}"; # vowel in stem
  103. }
  104. sub stemWord {
  105. my $page = shift;
  106. my $oldpage = $page;
  107. $page = "";
  108. # Split the word up at case changes and stem each subword
  109. my @words = split(/([a-z]*)([A-Z]+[a-z]+)/,$oldpage);
  110. foreach my $w(@words) {
  111. if ($w) {
  112. if ($w =~ /_/) { # Possible word separated by _
  113. my @subwords = split(/_/,$w);
  114. foreach my $w(@subwords) {
  115. if ($w) {
  116. $page .= lc(&stem($w)); #Force case changes to not matter
  117. }
  118. }
  119. }
  120. else{
  121. $page .= lc(&stem($w));
  122. }
  123. }
  124. }
  125. return $page;
  126. }