Here is a sample of Spanish vocabulary, with the stemmed forms that will be generated by this algorithm:
| word | stem | word | stem | |||||||||
|
che checa checar checo checoslovaquia chedraoui chefs cheliabinsk chelo chemical chemicalweek chemise chepo cheque chequeo cheques cheraw chesca chester chetumal chetumaleños chevrolet cheyene cheyenne chi chiapaneca chiapas chiba chic chica chicago chicana chicano chicas chicharrones chichen chichimecas chicles chico chicos |
⇒ |
che chec chec chec checoslovaqui chedraoui chefs cheliabinsk chel chemical chemicalweek chemis chep chequ cheque chequ cheraw chesc chest chetumal chetumaleñ chevrolet cheyen cheyenn chi chiapanec chiap chib chic chic chicag chican chican chic chicharron chich chichimec chicl chic chic |
torá tos toscano tosferina tostado tota total totales totalidad totalizó totalmente totopos tottenham touché tour tovar toyota toño tpc tqm trabado trabaja trabajaba trabajaban trabajada trabajado trabajador trabajadora trabajadoras trabajadores trabajamos trabajan trabajando trabajar trabajara trabajaron trabajará trabajarán trabajemos trabajen |
⇒ |
tor tos toscan tosferin tost tot total total total totaliz total totop tottenham touch tour tov toyot toñ tpc tqm trab trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj trabaj |
Accents are often missing in informally written Spanish. Since Snowball 3.0.0 some additional rules are included to try to handle commonly occurring cases where accents are omitted and an additional rule doesn't cause problems with other words. There's likely scope for further such improvements - please report instances.
Letters in Spanish include the following accented forms:
á é í ó ú ü ñ
The following letters are vowels:
a e i o u á é í ó ú ü
R2 is defined in the usual way — see the note on R1 and R2.
RV is defined as follows (and this is not the same as the French stemmer definition):
If the second letter is a consonant, RV is the region after the next following vowel, or if the first two letters are vowels, RV is the region after the next consonant, and otherwise (consonant-vowel case) RV is the region after the third letter. But RV is the end of the word if these positions cannot be found.
For example,
    m a c h o     o l i v a     t r a b a j o     á u r e o
         |...|         |...|         |.......|         |...|
(that is, RV is "ho" in macho, "va" in oliva, "bajo" in trabajo and "eo" in áureo.)
Always do steps 0 and 1.
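Step 0: Attached pronoun
Search for the longest among the following suffixes:
me se sela selo selas selos la le lo las les los nos
and delete it if it comes after one of
(a) iéndo ándo ár ér ír
(b) ando iendo ar er ir
(c) yendo following u
in RV. In the case of (c), yendo must lie in RV, but the preceding u can be outside it.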
In the case of (a), deletion is followed by removing the acute accent (for example, haciéndola → haciendo).
Step 1: Standard suffix removal
Do step 2a if no ending was removed by step 1.
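Step 2a: Verb suffixes beginning y
Search for the longest among the following suffixes in RV, and if found, delete it if it is preceded by u:
ya ye yan yen yeron yendo yo yó yas yes yais yamos
(The preceding u need not be in RV.)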
Do Step 2b if step 2a was done, but failed to remove a suffix.
Step 2b: Other verb suffixes
Always do step 3.
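Step 3: residual suffix
Search for the longest among the following suffixes, and perform the action indicated:
os a o á í ó — delete if in RV
e é — delete if in RV, and if then preceded by gu delete the u if that u is in RV (the g need not be in RV)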
And finally, remove acute accents from the remaining vowels (á → a, é → e, í → i, ó → o, ú → u).
// Routines implemented by this stemmer (declaration order is not significant).
routines (
    mark_regions
    postlude
    R1 R2 RV
    attached_pronoun
    standard_suffix
    y_verb_suffix
    verb_suffix
    residual_suffix
)

// The only routine exported to callers.
externals ( stem )

// Positions recorded by mark_regions and tested by RV, R1 and R2.
integers ( pV p1 p2 )

// v is the vowel grouping.
groupings ( v )

// Text between { and } inside a string literal is an escape sequence.
stringescapes {}
/* Named escapes for the accented letters; each expands to one code point. */

stringdef a'   '{U+00E1}'  // a with acute accent
stringdef e'   '{U+00E9}'  // e with acute accent
stringdef i'   '{U+00ED}'  // i with acute accent
stringdef o'   '{U+00F3}'  // o with acute accent
stringdef u'   '{U+00FA}'  // u with acute accent
stringdef u"   '{U+00FC}'  // u with diaeresis
stringdef n~   '{U+00F1}'  // n with tilde

// Vowels: the acute-accented vowels, u-diaeresis and the five plain vowels.
// A grouping is a set of characters, so the order inside the literal is free.
define v '{a'}{e'}{i'}{o'}{u'}{u"}aeiou'
// Record the start positions of RV (pV), R1 (p1) and R2 (p2).
define mark_regions as (

    // Defaults: each region starts at the end of the input, i.e. is empty,
    // unless the scans below find a better position.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV is chosen from the first two letters:
    //   vowel,     consonant -> just past the next vowel
    //   vowel,     vowel     -> just past the next consonant
    //   consonant, consonant -> just past the next vowel
    //   consonant, vowel     -> just past the third letter
    // The extra parentheses only spell out the grouping of each 'or'.
    do (
        (
            ( v     ( (non-v gopast v) or (v gopast non-v) ) )
            or
            ( non-v ( (non-v gopast v) or (v next) ) )
        )
        setmark pV
    )

    // p1: just past the first non-vowel that follows a vowel.
    // p2: the same scan repeated, starting from p1.
    do (
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )
)
// Walk forward through the word replacing each acute-accented vowel with
// its plain form.
define postlude as repeat (
    [substring] among(
        '{u'}' (<- 'u')
        '{o'}' (<- 'o')
        '{i'}' (<- 'i')
        '{e'}' (<- 'e')
        '{a'}' (<- 'a')
        // and possibly {u"}->u here, or in prelude

        // No accented vowel at the cursor: step over one character.
        // 'next' fails at the end of the input, which ends the repeat.
        ''     (next)
    )
)
backwardmode (
// Region tests. Each one succeeds when the corresponding mark (set by
// mark_regions) is at or before the current cursor position. They are
// used in the backward-mode routines after a suffix has been matched.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Step 0: remove a pronoun attached to a gerund or infinitive.
define attached_pronoun as (

    // Put the slice around the longest matching pronoun.
    [substring] among(
        'me' 'nos' 'se'
        'la' 'las' 'le' 'les' 'lo' 'los'
        'sela' 'selas' 'selo' 'selos'
    )

    // The pronoun must be preceded by one of these verb endings, and that
    // ending has to lie in RV.
    substring RV among(

        // Unaccented ending: delete just the pronoun.
        'ando' 'iendo'
        'ar' 'er' 'ir'
            (delete)

        // Accented ending: ']' moves the slice boundary to the cursor so
        // the slice covers ending + pronoun, then the whole slice is
        // replaced by the unaccented ending.
        '{a'}ndo'   (] <- 'ando')
        'i{e'}ndo'  (] <- 'iendo')
        '{a'}r'     (] <- 'ar')
        '{e'}r'     (] <- 'er')
        '{i'}r'     (] <- 'ir')

        // 'yendo' only counts after 'u'; the RV test has already run, so
        // the 'u' itself is not required to be in RV.
        'yendo'     ('u' delete)
    )
)
// Step 1: standard suffix removal. The slice is put around the longest
// matching suffix and the action attached to that suffix is run.
define standard_suffix as (
    [substring] among(

        // Delete if in R2.
        'able' 'ables'
        'amiento' 'amientos'
        'anza' 'anzas'
        'ica' 'icas' 'ico' 'icos'
        'ible' 'ibles'
        'imiento' 'imientos'
        'ismo' 'ismos'
        'ista' 'istas'
        'osa' 'osas' 'oso' 'osos'
            (
                R2 delete
            )

        // Delete if in R2; then also delete a preceding 'ic' if in R2.
        'aci{o'}n' 'aciones'
        'acion' // Misspelling of '-ación'.
        'ador' 'adora' 'adoras' 'adores'
        'ancia' 'ancias'
        'ante' 'antes'
            (
                R2 delete
                try ( ['ic'] R2 delete )
            )

        // Replace with 'log' if in R2.
        'log{i'}a'
        'log{i'}as'
            (
                R2 <- 'log'
            )

        // Replace with 'u' if in R2.
        'uci{o'}n' 'uciones'
        'ucion' // Misspelling of '-ución'.
            (
                R2 <- 'u'
            )

        // Replace with 'ente' if in R2.
        'encia' 'encias'
            (
                R2 <- 'ente'
            )

        // Delete if in R1; then delete a preceding iv / os / ic / ad if in
        // R2, and after 'iv' a further preceding 'at' if in R2.
        'amente'
            (
                R1 delete
                try (
                    [substring] R2 delete among(
                        'iv' (['at'] R2 delete)
                        // The remaining strings carry no extra action and
                        // must stay after the 'iv' action.
                        'ad'
                        'ic'
                        'os'
                    )
                )
            )

        // Delete if in R2; then delete a preceding ante / able / ible if
        // in R2.
        'mente'
            (
                R2 delete
                try (
                    [substring] among(
                        'able'
                        'ante'
                        'ible' (R2 delete)
                    )
                )
            )

        // Delete if in R2; then delete a preceding abil / ic / iv if in R2.
        'idad'
        'idades'
            (
                R2 delete
                try (
                    [substring] among(
                        'abil'
                        'ic'
                        'iv' (R2 delete)
                    )
                )
            )

        // Delete if in R2; then delete a preceding 'at' if in R2.
        'iva' 'ivas'
        'ivo' 'ivos'
            (
                R2 delete
                try (
                    ['at'] R2 delete // but not a further ['ic'] R2 delete
                )
            )
    )
)
// Step 2a: verb suffixes beginning with y.
define y_verb_suffix as (
    // The suffix search is limited to the region from pV onward; the 'u'
    // test in the action runs after that limit has been lifted.
    setlimit tomark pV for ([substring]) among(
        'ya' 'yas' 'yan' 'yais' 'yamos'
        'ye' 'yes' 'yen' 'yeron' 'yendo'
        'yo' 'y{o'}'
            ('u' delete)   // delete only when the suffix follows 'u'
    )
)
// Step 2b: other verb suffixes.
define verb_suffix as (
    // The suffix search is limited to the region from pV onward.
    setlimit tomark pV for ([substring]) among(

        // Delete; if the suffix follows 'gu', the 'u' is deleted as well
        // (']' moves the slice boundary past the 'u' before the delete).
        'en' 'es' '{e'}is' 'emos'
            (try ('u' test 'g') ] delete)

        // Every remaining suffix is simply deleted.

        // endings built on -ar-
        'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
        'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
        'ar{e'}'

        // endings built on -er-
        'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
        'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
        'er{e'}'

        // endings built on -ir-
        'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
        'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
        'ir{e'}'

        // all other endings
        'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
        'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
        'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
        'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
        'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
        'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
        'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
        'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
        '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
            (delete)
    )
)
// Step 3: residual suffix.
define residual_suffix as (
    [substring] among(

        // Delete if in RV.
        'a' '{a'}'
        'o' '{o'}' 'os'
        '{i'}'
            ( RV delete )

        // Delete if in RV; then, if what remains ends in 'gu', delete the
        // 'u' provided that 'u' is in RV.
        '{e'}' 'e'
            ( RV delete try( ['u'] test 'g' RV delete ) )
    )
)
)
// Entry point: stem the current word in place.
define stem as (

    // Set pV, p1 and p2.
    do mark_regions

    // Suffix stripping runs from the end of the word.
    backwards (

        // Step 0 is always attempted.
        do attached_pronoun

        // Step 1; step 2a runs only if step 1 failed, and step 2b only if
        // step 2a failed as well.
        do (
            standard_suffix or
            y_verb_suffix or
            verb_suffix
        )

        // Step 3 is always attempted.
        do residual_suffix
    )

    // Finally strip the remaining acute accents.
    do postlude
)