Here is a sample of Esperanto vocabulary, with the stemmed forms that will be generated by this algorithm:
word | stem | word | stem
abajo abako abandonante abandonas abandoni abandonis abandonita abandonitaj abandono abasida abasidaj abasidoj abata abate abateja abatejo |
⇒ |
abaj abak abandonant abandon abandon abandon abandonit abandonit abandon abasid abasid abasid abat abat abatej abatej |
kvazaŭ kvazaŭa kvazaŭaj kvazaŭan kvazaŭarmea kvazaŭdeciduaj kvazaŭe kvazaŭregulaj kvazaŭĉiamverdaj kvena kvenja kverado kverelas kvereli kverelis kverelo |
⇒ |
kvazaŭ kvazaŭ kvazaŭ kvazaŭ kvazaŭarme kvazaŭdecidu kvazaŭ kvazaŭregul kvazaŭĉiamverd kven kvenj kverad kverel kverel kverel kverel |
Letters in Esperanto include the following accented forms: ĉ, ĝ, ĥ, ĵ, ŝ, ŭ.
The following letters are vowels: a, e, i, o, u.
The algorithm removes suffixes for part of speech ("-a", "-e", "-i", "-o"), verb inflection ("-as", "-is", "-os", "-u", "-us"), number ("-j"), and case ("-n"). That covers most words in Esperanto. The rest of the algorithm covers various exceptions and uncommon patterns. Unofficial words are supported when unambiguous.
The verbal suffixes "-ant", "-int", and "-ont" and prefixes "ek-" and "el-" are not removed, because there are too many exceptions where those syllables are part of the stem.
Words containing any of the non-Esperanto letters "á", "é", "í", "ó", "q", "ú", "w", "x", or "y" are not stemmed, except that a suffix may follow a non-Esperanto word if separated by a hyphen. Stems containing digits may also have suffixes, with or without hyphens.
The x-system sequences "cx", "gx", "hx", "jx", "sx", and "ux" are canonicalized to "ĉ", "ĝ", "ĥ", "ĵ", "ŝ", and "ŭ". The h-system is too ambiguous to support.
Inflections of "'sti" are expanded into forms of "esti". The words "l'" and "un'" become "la" and "unu". A final apostrophe becomes "aŭ" after certain known stems, or else "o".
One-syllable words are not stemmed. Each vowel forms a separate syllable. Words containing digits are always stemmed because numeral words are inflectable. Words containing non-initial hyphens are always stemmed because they might be abbreviations with multiple syllables.
Pronouns, correlatives, and the numeral "unu" have limited inflections. These are handled in the pronoun, correlative, and ujn_suffix routines. Derived forms like "ilia" and "kieo" follow the regular stemming rules.
Multimorphemic vowel-final numerals like "centunu" and "kvardekdu" are not stemmed. These are misspellings (they should be multiple words, like "cent unu" and "kvardek du") but they are unambiguous. Merged numerals are not supported for "mil" because that would create an ambiguity for "mildu", the imperative form of the adjective "milda".
Certain other multisyllabic words are invariant, including some interjections, adverbs, and prepositions. These are listed explicitly in the uninflected routine.
// Flag maintained by canonical_form: set when a non-Esperanto letter has been
// seen, cleared again by a following hyphen.
booleans ( foreign )
// Internal routines; each is defined further down in this file.
routines (
canonical_form
correlative
final_apostrophe
initial_apostrophe
long_word
merged_numeral
not_after_letter
pronoun
standard_suffix
ujn_suffix
uninflected
)
// The only routine exported to users of the stemmer.
externals ( stem )
// Character classes referenced by the routines.
groupings ( vowel aou digit )
// The five vowels; long_word skips past them when checking word length.
define vowel 'aeiou'
// The vowels that correlative accepts before an optional "-j"/"-n" ending.
define aou 'aou'
// Decimal digits; used by long_word and not_after_letter.
define digit '0123456789'
// Use { and } to delimit escape sequences inside the string literals below.
stringescapes {}
// Esperanto accented letters: c, g, h, j, s with circumflex and u with breve.
// canonical_form rewrites the x-system digraphs (cx, gx, ...) to these.
stringdef c^ '{U+0109}'
stringdef g^ '{U+011D}'
stringdef h^ '{U+0125}'
stringdef j^ '{U+0135}'
stringdef s^ '{U+015D}'
stringdef u+ '{U+016D}'
// Acute-accented vowels. canonical_form treats these as non-Esperanto letters.
stringdef a' '{U+00E1}'
stringdef e' '{U+00E9}'
stringdef i' '{U+00ED}'
stringdef o' '{U+00F3}'
stringdef u' '{U+00FA}'
// Normalizes the spelling of the word and decides whether it may be stemmed.
// Walks forward through the word one match at a time, rewriting x-system
// digraphs to accented letters and tracking the `foreign` flag. Signals
// success only if `foreign` is unset once the whole word has been scanned.
define canonical_form as (
unset foreign
// Keep going until the final alternative's `next` fails at the end of the word.
repeat (
// `substring` picks the longest entry that matches at the cursor, so "cx"
// is taken as one unit and never reaches the bare 'x' entry.
[substring]
among(
// x-system digraphs become the corresponding accented letter.
'cx' (<- '{c^}')
'gx' (<- '{g^}')
'hx' (<- '{h^}')
'jx' (<- '{j^}')
'sx' (<- '{s^}')
'ux' (<- '{u+}')
// Acute-accented vowels are replaced by the plain vowel and mark the word
// as foreign. Nothing in this routine undoes the replacement afterwards.
'{a'}' (<- 'a' set foreign)
'{e'}' (<- 'e' set foreign)
'{i'}' (<- 'i' set foreign)
'{o'}' (<- 'o' set foreign)
'{u'}' (<- 'u' set foreign)
// Letters outside the Esperanto alphabet mark the word as foreign.
'q' 'w' 'x' 'y' (set foreign)
// A hyphen clears the flag, so only letters after the last hyphen decide
// the final result.
'-' (unset foreign)
// Anything else: the empty string always matches; step over one character.
'' (next)
)
)
// Fail for words that are still marked foreign.
not foreign
)
// Expands an elided form of "esti": when the whole word is an apostrophe
// followed by "st" and one of the listed endings, the apostrophe (the marked
// slice) is replaced by "e". Fails, changing nothing, for any other word.
define initial_apostrophe as (
['{'}'] 'st' among('as' 'i' 'is' 'os' 'u' 'us') atlimit <- 'e'
)
backwardmode (
// Backward-mode routine. Succeeds when the word, or the part of it after a
// hyphen, is one of the listed pronouns with an optional final "n".
// On success the "n" (if present) is deleted; nothing else is removed.
define pronoun as (
// The slice covers only the optional trailing "n".
[try 'n']
among(
'ci' 'gi' '{g^}i' 'hi' 'ili' 'i{s^}i' 'ivi' 'li' 'mal{s^}i' 'mi' 'ni'
'oni' 'ri' 'si' '{s^}i' '{s^}li' 'vi'
)
// The pronoun must start the word or directly follow a hyphen.
(atlimit or '-') delete
)
// Backward-mode routine. Replaces a word-final apostrophe with the letters it
// stands for. Fails without changing anything if the word does not end in an
// apostrophe.
define final_apostrophe as (
// The slice is the apostrophe itself; each alternative below replaces it.
['{'}']
// The whole word is "l'": becomes "la".
('l' atlimit <- 'a') or
// The whole word is "un'": becomes "unu".
('un' atlimit <- 'u') or
(
// Known stems, at the start of the word or after a hyphen, that take the
// replacement "a{u+}".
among(
'adi' 'almen' 'amb' 'ank' 'ankor' 'anstat' 'anta{u+}hier' 'apen'
'bald' '{c^}irk' 'hier' 'hodi' 'kontr' 'kvaz' 'malbald' 'malgr'
'morg' 'postmorg' 'presk' 'tut{c^}irk'
) (atlimit or '-') <- 'a{u+}'
) or
// Any other word: the apostrophe becomes "o".
(<- 'o')
)
// Backward-mode routine. Succeeds when the word, or the part of it after a
// hyphen, is "aliu" or "unu" followed by an optional "j" and an optional "n".
// The slice covers just that "j"/"n"/"jn" ending, which is deleted.
define ujn_suffix as (
[try 'n' try 'j'] among('aliu' 'unu') (atlimit or '-') delete
)
define uninflected as (
    // Backward-mode routine. Explicit list of invariant words. It succeeds
    // when the text before the cursor ends in one of these words and that
    // word starts the input or directly follows a hyphen. Nothing is modified.
    among(
        'aha' 'amen'
        'dirlididi' 'disde'
        'ehe' 'ekde' 'elde'
        'haha' 'haleluja' 'hola' 'hosana' 'hura' '{h^}a{h^}a'
        'mal{c^}i' 'malkaj' 'malpli' 'maltra' 'maltre' 'maltro'
        'minus' 'muu'
        'oho'
        'tamen'
        'uhu'
    )
    (atlimit or '-')
)
define merged_numeral as (
    // Backward-mode routine, so the matches run right to left: the word must
    // end in one of the first group, immediately preceded by one of the
    // second group (e.g. "centunu", "kvardekdu"). Nothing is modified.
    among(
        'du'
        'tri'
        'unu'
    )
    among(
        'cent'
        'dek'
    )
)
// Backward-mode routine. Recognizes a correlative ending and strips its
// "-n" / "-j" / "-jn" inflection. The accepted shape, read left to right, is:
// optional listed prefix, "i", then either "e" with optional "n", or one of
// a/o/u with optional "j" and optional "n". The form must start the word or
// follow a hyphen.
define correlative as (
// Start with an empty slice at the end of the word.
[]
// Ignore -al, -am, etc. since they can't be confused with suffixes.
// `test` restores the cursor afterwards, but the slice end set by `]` inside
// it is kept, so the `delete` below removes exactly the matched inflection.
test (
((try 'n'] 'e') or (try 'n' try 'j'] aou))
'i'
try among('{c^}' 'k' 'kelk' 'mult' 'nen' 'samt' 't')
(atlimit or '-')
)
delete
)
// Backward-mode routine. Succeeds when the word qualifies for suffix removal:
// scanning from the end, it can pass two vowels, or a hyphen that still has
// at least one character before it, or a digit.
define long_word as (
loop 2 gopast vowel or (gopast '-' next) or gopast digit
)
// Backward-mode condition used by standard_suffix on the bare endings
// 'j', 'jn' and 'n': succeeds when the preceding character is a hyphen or a
// digit.
define not_after_letter as ('-' or digit)
// Backward-mode routine. Removes one regular ending from the end of the word.
// Fails, leaving the word unchanged, if none of the listed endings matches.
define standard_suffix as (
// The slice covers the longest matching ending plus a hyphen directly before
// it, if there is one, so the hyphen is removed together with the ending.
[substring try '-']
among(
// Adjective endings.
'a' 'aj' 'ajn' 'an'
// Adverb endings.
'e' 'en'
// Verb endings.
'i' 'as' 'is' 'os' 'u' 'us'
// Noun endings.
'o' 'oj' 'ojn' 'on'
// Bare number/case endings are accepted only when not_after_letter holds,
// i.e. directly after a hyphen or a digit.
'j' not_after_letter
'jn' not_after_letter
'n' not_after_letter
)
delete
)
)
// Entry point of the stemmer. The steps run in order and the first failing
// step ends the routine; changes already made to the word are kept.
define stem as (
// Normalize spelling. `test` puts the cursor back at the start afterwards,
// and a word still marked foreign makes the whole routine stop here.
test canonical_form
// Optional: expand a leading apostrophe form of "esti".
do initial_apostrophe
backwards (
// Each `not X` stops stemming when X recognizes the word as a special case.
// pronoun, correlative and ujn_suffix delete their own limited endings
// before stopping.
not pronoun
// Optional: expand a word-final apostrophe.
do final_apostrophe
not correlative
not uninflected
not merged_numeral
not ujn_suffix
// Check the length condition without moving the cursor.
test long_word
standard_suffix
)
)