Here is a sample of Spanish vocabulary, with the stemmed forms that will be generated by this algorithm:
Each line of words below is followed by ⇒ and then by a line giving the corresponding stems, in the same order:
che checa checar checo checoslovaquia chedraoui chefs cheliabinsk chelo chemical chemicalweek chemise chepo cheque chequeo cheques cheraw chesca chester chetumal chetumaleños chevrolet cheyene cheyenne chi chía chiapaneca chiapas chiba chic chica chicago chicana chicano chicas chicharrones chichen chichimecas chicles chico |
⇒ |
che chec chec chec checoslovaqui chedraoui chefs cheliabinsk chel chemical chemicalweek chemis chep chequ cheque chequ cheraw chesc chest chetumal chetumaleñ chevrolet cheyen cheyenn chi chi chiapanec chiap chib chic chic chicag chican chican chic chicharron chich chichimec chicl chic |
torá tórax torcer toreado toreados toreándolo torear toreara torearlo toreó torero toreros torio tormenta tormentas tornado tornados tornar tornen torneo torneos tornillo tornillos torniquete torno toro toronto toros torpedearon torpeza torrado torralba torre torrencial torrenciales torrente torreon torreón torres torrescano |
⇒ |
tor torax torc tor tor tor tor tor tor tore torer torer tori torment torment torn torn torn torn torne torne tornill tornill torniquet torn tor toront tor torped torpez torr torralb torr torrencial torrencial torrent torreon torreon torr torrescan |
Letters in Spanish include the following accented forms: á é í ó ú ü ñ
The following letters are vowels: a e i o u á é í ó ú ü
R2 is defined in the usual way — see the note on R1 and R2.
RV is defined as follows (and this is not the same as the French stemmer definition):
If the second letter is a consonant, RV is the region after the next following vowel, or if the first two letters are vowels, RV is the region after the next consonant, and otherwise (consonant-vowel case) RV is the region after the third letter. But RV is the end of the word if these positions cannot be found.
For example,
    m a c h o     o l i v a     t r a b a j o     á u r e o
         |...|         |...|         |.......|         |...|
That is, RV is "ho" in macho, "va" in oliva, "bajo" in trabajo and "eo" in áureo.
Always do steps 0 and 1.
Step 0: Attached pronoun
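Search for the longest among the following suffixes: me, se, sela, selo, selas, selos, la, le, lo, las, les, los, nos; and delete it if it comes after one of (a) iéndo, ándo, ár, ér, ír; (b) ando, iendo, ar, er, ir; (c) yendo following u, in RV. In the case of (c), yendo must lie in RV, but the preceding u can be outside it.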
In the case of (a), deletion is followed by removing the acute accent (for example, haciéndola → haciendo).
Step 1: Standard suffix removal
Do step 2a if no ending was removed by step 1.
Step 2a: Verb suffixes beginning y
Do Step 2b if step 2a was done, but failed to remove a suffix.
Step 2b: Other verb suffixes
Always do step 3.
Step 3: residual suffix
And finally: remove acute accents (á → a, é → e, í → i, ó → o, ú → u).
// Declarations for the names used by this stemmer.

// Internal routines.
routines (
    mark_regions postlude
    R1 R2 RV
    attached_pronoun
    standard_suffix
    y_verb_suffix verb_suffix
    residual_suffix
)

// The routine exposed to callers.
externals ( stem )

// Integer marks: set by mark_regions, consulted by RV, R1 and R2.
integers ( pV p1 p2 )

// Character grouping holding the vowels.
groupings ( v )

// Inside string literals, { and } delimit an escape such as {a'}.
stringescapes {}
/* Named escapes for the accented letters, listed in code point order */

stringdef a'   '{U+00E1}'   // a with acute accent
stringdef e'   '{U+00E9}'   // e with acute accent
stringdef i'   '{U+00ED}'   // i with acute accent
stringdef n~   '{U+00F1}'   // n with tilde
stringdef o'   '{U+00F3}'   // o with acute accent
stringdef u'   '{U+00FA}'   // u with acute accent
stringdef u"   '{U+00FC}'   // u with diaeresis

// Vowels: the five plain vowels, their acute-accented forms and u-diaeresis.
// n-tilde is not part of this grouping.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
// Locate the start of the regions RV, R1 and R2 and record them in the
// marks pV, p1 and p2.  All three marks start out at limit, so a region
// whose start cannot be found keeps that default.
define mark_regions as (

    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV: start of RV.
    do (
        (
            // First letter is a vowel:
            //   followed by a non-vowel -> go past the next vowel
            //   followed by a vowel     -> go past the next non-vowel
            v  (non-v gopast v) or (v gopast non-v)
        )
        or
        (
            // First letter is a non-vowel:
            //   followed by a non-vowel -> go past the next vowel
            //   followed by a vowel     -> step over one more character
            non-v  (non-v gopast v) or (v next)
        )
        setmark pV
    )

    // p1: just past the first non-vowel that follows a vowel.
    // p2: the same search repeated, continuing from p1.
    do (
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )
)
// Scan forward through the word, replacing each acute-accented vowel by its
// unaccented form; any other character is stepped over unchanged.
define postlude as repeat (
    [substring] among(
        '{u'}'  (<- 'u')
        '{o'}'  (<- 'o')
        '{i'}'  (<- 'i')
        '{e'}'  (<- 'e')
        '{a'}'  (<- 'a')
        // {u"} -> u could possibly be handled here as well, or in a prelude
        ''      (next)   // nothing to replace at this position: move on
    )
)
backwardmode (
// Region tests: each one succeeds only when its mark is at or before the
// current cursor position.
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define RV as $pV <= cursor
// Step 0: remove an attached pronoun from the end of the word, provided the
// text just before it is one of the verb endings below and that ending
// passes the RV test.
define attached_pronoun as (

    // Bracket the longest matching pronoun.
    [substring] among(
        'la' 'las' 'le' 'les' 'lo' 'los'
        'me' 'nos' 'se'
        'sela' 'selas' 'selo' 'selos'
    )

    // Examine the ending that precedes the pronoun.
    substring RV among(

        // Accented endings: the bracket is extended over the ending, so the
        // ending plus pronoun is replaced by the unaccented ending.
        '{a'}ndo'   (] <- 'ando')
        'i{e'}ndo'  (] <- 'iendo')
        '{a'}r'     (] <- 'ar')
        '{e'}r'     (] <- 'er')
        '{i'}r'     (] <- 'ir')

        // Unaccented endings: only the bracketed pronoun is deleted.
        'ar' 'er' 'ir'
        'ando' 'iendo'
                    (delete)

        // yendo: the pronoun is deleted only if a u comes before yendo.
        // The RV test has already been applied, so the u itself is not
        // required to be inside RV.
        'yendo'     ('u' delete)
    )
)
// Step 1: standard suffix removal.  The longest listed suffix at the end of
// the word is bracketed, then the action attached to its group runs.  The
// routine fails (signals nothing removed) when the R1/R2 test in that action
// fails.
define standard_suffix as (
    [substring] among(

        // Delete if in R2.
        'anza'      'anzas'
        'ica'       'icas'       'ico'    'icos'
        'ismo'      'ismos'
        'able'      'ables'
        'ible'      'ibles'
        'ista'      'istas'
        'osa'       'osas'       'oso'    'osos'
        'amiento'   'amientos'
        'imiento'   'imientos'
        (
            R2 delete
        )

        // Delete if in R2; then delete a preceding 'ic' if that is in R2.
        'ador'      'adores'     'adora'  'adoras'
        'aci{o'}n'  'aciones'
        'ante'      'antes'      'ancia'  'ancias'   // Note 1
        (
            R2 delete
            try ( ['ic'] R2 delete )
        )

        // Replace by 'log' if in R2.
        'log{i'}a'  'log{i'}as'
        (
            R2 <- 'log'
        )

        // Replace by 'u' if in R2.
        'uci{o'}n'  'uciones'
        (
            R2 <- 'u'
        )

        // Replace by 'ente' if in R2.
        'encia'     'encias'
        (
            R2 <- 'ente'
        )

        // Delete if in R1.  Then, if preceded by iv, os, ic or ad lying in
        // R2, delete that too; after 'iv' a further 'at' in R2 is deleted.
        // The entries without an action must stay after 'iv'.
        'amente'
        (
            R1 delete
            try (
                [substring] R2 delete among(
                    'iv' (['at'] R2 delete)
                    'os'
                    'ic'
                    'ad'
                )
            )
        )

        // Delete if in R2; then delete a preceding able, ante or ible if
        // that is in R2.
        'mente'
        (
            R2 delete
            try (
                [substring] among(
                    'able'
                    'ante'   // Note 1
                    'ible'
                        (R2 delete)
                )
            )
        )

        // Delete if in R2; then delete a preceding abil, ic or iv if that
        // is in R2.
        'idad'      'idades'
        (
            R2 delete
            try (
                [substring] among(
                    'abil' 'ic' 'iv'
                        (R2 delete)
                )
            )
        )

        // Delete if in R2; then delete a preceding 'at' if that is in R2
        // (but not a further 'ic').
        'iva'       'ivas'       'ivo'    'ivos'
        (
            R2 delete
            try (
                ['at'] R2 delete
            )
        )
    )
)
// Step 2a: verb suffixes beginning with y.  The suffix is searched for with
// the limit set at pV, and is deleted only when it is preceded by u.
define y_verb_suffix as (
    setlimit tomark pV for ([substring]) among(
        'ya'    'yas'   'yan'   'yamos'  'yais'
        'ye'    'yes'   'yen'
        'yo'    'y{o'}'
        'yeron' 'yendo'
            ('u' delete)
    )
)
// Step 2b: other verb suffixes.  The suffix is searched for with the limit
// set at pV; the action of the matching group then removes it.
define verb_suffix as (
    setlimit tomark pV for ([substring]) among(

        // Delete; if the suffix is preceded by gu, the u is deleted as well.
        'es' 'en' 'emos' '{e'}is'
            (try ('u' test 'g') ] delete)

        // Everything below is simply deleted.

        // ar- / er- / ir- followed by a further ending
        'ar{a'}'  'ar{a'}n'  'ar{a'}s'  'ar{e'}'  'ar{e'}is'  'aremos'
        'ar{i'}a' 'ar{i'}ais' 'ar{i'}amos' 'ar{i'}an' 'ar{i'}as'

        'er{a'}'  'er{a'}n'  'er{a'}s'  'er{e'}'  'er{e'}is'  'eremos'
        'er{i'}a' 'er{i'}ais' 'er{i'}amos' 'er{i'}an' 'er{i'}as'

        'ir{a'}'  'ir{a'}n'  'ir{a'}s'  'ir{e'}'  'ir{e'}is'  'iremos'
        'ir{i'}a' 'ir{i'}ais' 'ir{i'}amos' 'ir{i'}an' 'ir{i'}as'

        // -ar / -er / -ir, -ndo, -do/-da and -d endings
        'ar'   'er'   'ir'
        'ando' 'iendo'
        'ado'  'ada'  'ados'  'adas'
        'ido'  'ida'  'idos'  'idas'
        'ad'   'ed'   'id'

        // -aba- family
        'aba'  'abas'  'aban'  '{a'}bamos'  'abais'

        // -{i'}a- family
        '{i'}a'  '{i'}as'  '{i'}an'  '{i'}amos'  '{i'}ais'

        // -ara- / -iera- family
        'ara'   'aras'   'aran'   '{a'}ramos'   'arais'
        'iera'  'ieras'  'ieran'  'i{e'}ramos'  'ierais'

        // -ase- / -iese- family
        'ase'   'ases'   'asen'   '{a'}semos'   'aseis'
        'iese'  'ieses'  'iesen'  'i{e'}semos'  'ieseis'

        // -aste / -iste, -aron / -ieron and related endings
        'aste'  'iste'  'asteis'  'isteis'
        'aron'  'ieron' 'i{o'}'   'imos'

        // remaining short endings
        'an'  'as'  'amos'  '{a'}is'  '{i'}s'
            (delete)
    )
)
// Step 3: residual suffix.  The longest of the endings below is bracketed
// and removed if it lies in RV.
define residual_suffix as (
    [substring] among(

        // Delete if in RV.
        'a' '{a'}' 'o' '{o'}' '{i'}' 'os'
            ( RV delete )

        // Delete if in RV; then, if preceded by gu with the u in RV,
        // delete the u as well.
        '{e'}' 'e'
            (
                RV delete
                try ( ['u'] test 'g' RV delete )
            )
    )
)
)
// Entry point.  Each stage is wrapped in `do`, so a stage that fails does
// not make stem itself fail.
define stem as (

    // Forward pass: set pV, p1 and p2.
    do mark_regions

    // Suffix stripping works from the end of the word.
    backwards (
        // Step 0
        do attached_pronoun

        // Step 1; step 2a is tried only if step 1 fails, and step 2b only
        // if step 2a fails too.
        do (
            standard_suffix
            or y_verb_suffix
            or verb_suffix
        )

        // Step 3
        do residual_suffix
    )

    // Forward pass: strip acute accents.
    do postlude
)
/*
Note 1: additions of 15 Jun 2005
*/