Esperanto stemming algorithm

Links to resources

Here is a sample of Esperanto vocabulary, with the stemmed forms that will be generated by this algorithm:

word

stem

word

stem

abajo
abako
abandonante
abandonas
abandoni
abandonis
abandonita
abandonitaj
abandono
abasida
abasidaj
abasidoj
abata
abate
abateja
abatejo

⇒

abaj
abak
abandonant
abandon
abandon
abandon
abandonit
abandonit
abandon
abasid
abasid
abasid
abat
abat
abatej
abatej

kvazaŭ
kvazaŭa
kvazaŭaj
kvazaŭan
kvazaŭarmea
kvazaŭdeciduaj
kvazaŭe
kvazaŭregulaj
kvazaŭĉiamverdaj
kvena
kvenja
kverado
kverelas
kvereli
kverelis
kverelo

⇒

kvazaŭ
kvazaŭ
kvazaŭ
kvazaŭ
kvazaŭarme
kvazaŭdecidu
kvazaŭ
kvazaŭregul
kvazaŭĉiamverd
kven
kvenj
kverad
kverel
kverel
kverel
kverel

The stemming algorithm

Letters in Esperanto include the following accented forms:

ĉ ĝ ĥ ĵ ŝ ŭ

The following letters are vowels:

a e i o u

The algorithm removes suffixes for part of speech ("-a", "-e", "-i", "-o"), verb inflection ("-as", "-is", "-os", "-u", "-us"), number ("-j"), and case ("-n"). That covers most words in Esperanto. The rest of the algorithm covers various exceptions and uncommon patterns. Unofficial words are supported when unambiguous.

The verbal suffixes "-ant", "-int", and "-ont" and prefixes "ek-" and "el-" are not removed, because there are too many exceptions where those syllables are part of the stem.

Words containing any of the non-Esperanto letters "á", "é", "í", "ó", "q", "ú", "w", "x", or "y" are not stemmed, except that a suffix may follow a non-Esperanto word if separated by a hyphen. Stems containing digits may also have suffixes, with or without hyphens.

The x-system sequences "cx", "gx", "hx", "jx", "sx", and "ux" are canonicalized to "ĉ", "ĝ", "ĥ", "ĵ", "ŝ", and "ŭ". The h-system is too ambiguous to support.

Inflections of "'sti" are expanded into forms of "esti". The words "l'" and "un'" become "la" and "unu". A final apostrophe becomes "aŭ" after certain known stems, or else "o".

One-syllable words are not stemmed. Each vowel forms a separate syllable. Words containing digits are always stemmed because numeral words are inflectable. Words containing non-initial hyphens are always stemmed because they might be abbreviations with multiple syllables.

Pronouns, correlatives, and the numeral "unu" have limited inflections. These are handled in the pronoun, correlative, and ujn_suffix routines. Derived forms like "ilia" and "kieo" follow the regular stemming rules.

Multimorphemic vowel-final numerals like "centunu" and "kvardekdu" are not stemmed. These are misspellings (they should be multiple words, like "cent unu" and "kvardek du") but they are unambiguous. Merged numerals are not supported for "mil" because that would create an ambiguity for "mildu", the imperative form of the verb "mildi" (derived from the adjective "milda").

Certain other multisyllabic words are invariant, including some interjections, adverbs, and prepositions. These are listed explicitly in the uninflected routine.

// ---------------------------------------------------------------------
// Declarations. Each name is declared here before it is defined or used.
// ---------------------------------------------------------------------

// Routines, listed roughly in the order in which stem invokes them.
routines (
    canonical_form initial_apostrophe
    pronoun final_apostrophe correlative uninflected merged_numeral
    ujn_suffix long_word standard_suffix not_after_letter
)

// The single entry point exported to callers.
externals ( stem )

// Flag maintained by canonical_form while it scans the word.
booleans ( foreign )

groupings ( vowel aou digit )

// Use { and } to introduce escapes inside string literals.
stringescapes {}

// Esperanto letters carrying a diacritic.
stringdef c^ '{U+0109}'  // c with circumflex
stringdef g^ '{U+011D}'  // g with circumflex
stringdef h^ '{U+0125}'  // h with circumflex
stringdef j^ '{U+0135}'  // j with circumflex
stringdef s^ '{U+015D}'  // s with circumflex
stringdef u+ '{U+016D}'  // u with breve

// Acute-accented vowels, which are not Esperanto letters.
stringdef a' '{U+00E1}'
stringdef e' '{U+00E9}'
stringdef i' '{U+00ED}'
stringdef o' '{U+00F3}'
stringdef u' '{U+00FA}'

// Character classes.
define vowel 'aeiou'
define aou   'aou'
define digit '0123456789'

// Normalise the spelling of the word, scanning it left to right, and
// decide whether it may be stemmed at all.
//
// The foreign flag is cleared at the start and again at every hyphen, and
// is set by any non-Esperanto letter. The routine signals failure when the
// flag is still set once the scan is finished, i.e. when a non-Esperanto
// letter occurs with no hyphen after it.
define canonical_form as (
    unset foreign
    repeat (
        [substring]
        among(
            // Letters that do not belong to the Esperanto alphabet. An x
            // that is part of one of the digraphs below is claimed by the
            // longer match instead.
            'q' 'w' 'x' 'y' (set foreign)

            // Acute-accented vowels: drop the accent, but still treat the
            // letter as foreign.
            '{a'}' (<- 'a' set foreign)
            '{e'}' (<- 'e' set foreign)
            '{i'}' (<- 'i' set foreign)
            '{o'}' (<- 'o' set foreign)
            '{u'}' (<- 'u' set foreign)

            // x-system digraphs become the proper accented letter.
            'cx' (<- '{c^}')
            'gx' (<- '{g^}')
            'hx' (<- '{h^}')
            'jx' (<- '{j^}')
            'sx' (<- '{s^}')
            'ux' (<- '{u+}')

            // Whatever follows a hyphen is judged afresh.
            '-' (unset foreign)

            // Anything else: step over one character. At the end of the
            // word next fails, which terminates the repeat.
            '' (next)
        )
    )
    not foreign
)

// Expand an elided form of the verb esti: when the whole word is an
// apostrophe, then st, then one of the listed verb endings, the apostrophe
// is replaced by e.
define initial_apostrophe as (
    ['{'}']                               // the slice is the apostrophe only
    'st'
    among('i' 'u' 'as' 'is' 'os' 'us')
    atlimit                               // nothing may follow the ending
    <- 'e'
)

backwardmode (
    // Recognise a pronoun, optionally followed by the case ending n, and
    // strip that n. The pronoun has to be the entire word or be preceded
    // by a hyphen. The routine also succeeds when there is no n to remove.
    define pronoun as (
        [try 'n']
        among(
            'mi' 'ci' 'vi' 'li' '{s^}i' '{g^}i' 'ri' 'ni' 'ili' 'oni' 'si'
            'gi' 'hi' 'i{s^}i' 'ivi' 'mal{s^}i' '{s^}li'
        )
        (atlimit or '-')
        delete
    )

    // Replace a word-final apostrophe with the letters it stands for.
    // The alternatives are tried in order and the first that applies wins.
    define final_apostrophe as (
        ['{'}']
        (
            // The word is exactly l plus apostrophe: restore la.
            ('l' atlimit <- 'a')
            or
            // The word is exactly un plus apostrophe: restore unu.
            ('un' atlimit <- 'u')
            or
            // One of the known stems, either as the whole word or after
            // a hyphen: the apostrophe stands for a followed by u-breve.
            (
                among(
                    'hier' 'anta{u+}hier' 'hodi' 'morg' 'postmorg'
                    'bald' 'malbald' 'ankor' 'apen' 'presk' 'almen'
                    'ank' 'amb' 'adi' 'kvaz'
                    'anstat' 'kontr' 'malgr' '{c^}irk' 'tut{c^}irk'
                )
                (atlimit or '-')
                <- 'a{u+}'
            )
            or
            // Otherwise the apostrophe stands for o.
            (<- 'o')
        )
    )

    // Strip an optional j and/or n from the words aliu and unu, provided
    // the word stands alone or follows a hyphen. The final u is kept.
    define ujn_suffix as (
        [
            try 'n'
            try 'j'
        ]
        among('unu' 'aliu')
        (atlimit or '-')
        delete
    )

    // Succeed when the word, or the part of it after the last hyphen, is
    // one of the listed invariant words. Nothing is modified here; stem
    // uses the result to skip suffix removal.
    define uninflected as (
        among(
            'aha' 'ehe' 'oho' 'uhu' 'haha' '{h^}a{h^}a' 'muu'
            'hola' 'hura' 'amen' 'hosana' 'haleluja' 'dirlididi'
            'disde' 'ekde' 'elde'
            'mal{c^}i' 'malkaj' 'malpli' 'maltra' 'maltre' 'maltro'
            'minus' 'tamen'
        )
        (atlimit or '-')
    )

    // Succeed when the word ends in unu, du or tri directly preceded by
    // dek or cent. Scanning is backwards, so the first among matches the
    // end of the word and the second matches what comes just before it.
    define merged_numeral as (
        among('unu' 'du' 'tri')
        among('dek' 'cent')
    )

    // Recognise a correlative ending in -ie, -ia, -io or -iu and strip its
    // inflection: an n after e, or a j and/or n after a, o or u. The vowel
    // itself is kept. The routine also succeeds, deleting nothing, when the
    // correlative carries no inflection; stem calls it as "not correlative",
    // so any match prevents further suffix removal.
    define correlative as (
        // Mark both ends of the slice at the current cursor position.
        []
        // Ignore -al, -am, etc. since they can't be confused with suffixes.
        test (
            // The unbalanced ] moves one slice boundary past the optional
            // n (before e) or optional n and j (before a, o or u), so the
            // slice covers exactly those trailing letters and nothing else.
            ((try 'n'] 'e') or (try 'n' try 'j'] aou))
            // Every form handled here has i before the final vowel.
            'i'
            // Optional correlative prefix; bare forms such as iu have none.
            try among('{c^}' 'k' 'kelk' 'mult' 'nen' 'samt' 't')
            // The correlative must be the whole word or follow a hyphen.
            (atlimit or '-')
        )
        // test restored the cursor but the slice boundaries set above
        // remain, so this removes only the inflection letters.
        delete
    )

    // Succeed when the word is long enough to be stemmed. Any one of the
    // following is sufficient:
    //   - it contains at least two vowels;
    //   - it contains a hyphen with at least one character before it;
    //   - it contains a digit.
    define long_word as (
        (loop 2 gopast vowel) or
        (gopast '-' next) or
        (gopast digit)
    )

    // Condition used by standard_suffix: the character just before the
    // candidate ending must be a digit or a hyphen.
    define not_after_letter as (digit or '-')

    // Remove the longest matching regular ending, together with a hyphen
    // immediately before it if there is one. Fails, leaving the word
    // untouched, when no ending matches.
    define standard_suffix as (
        [substring try '-']
        among(
            // Endings with no vowel are accepted only when
            // not_after_letter holds for what precedes them.
            'j' not_after_letter
            'n' not_after_letter
            'jn' not_after_letter

            // Bare vowel endings.
            'a' 'o' 'e' 'i' 'u'

            // Vowel ending followed by j and/or n.
            'aj' 'oj'
            'an' 'on' 'en'
            'ajn' 'ojn'

            // Verb endings in s.
            'as' 'is' 'os' 'us'
        )
        delete
    )
)

// Entry point: stem the current word in place.
//
// The steps form a strict pipeline. Each "not X" line aborts stemming as
// soon as routine X recognises the word, keeping any edit X has already
// made (for example the removal of a case ending from a pronoun).
define stem as (
    // Normalise spelling. test restores the cursor to the start of the
    // word and passes on the result, so a word that canonical_form rejects
    // as foreign is not stemmed any further.
    test canonical_form
    // Expand an elided form of esti; do ignores failure.
    do initial_apostrophe
    // Everything below scans from the end of the word.
    backwards (
        // Pronouns only ever lose a case ending.
        not pronoun
        // Expand a trailing apostrophe; do ignores failure.
        do final_apostrophe
        // Correlatives only lose the inflection after their final vowel.
        not correlative
        // Explicitly listed invariant words are left alone.
        not uninflected
        // Numerals written together, such as centunu, are left alone.
        not merged_numeral
        // aliu and unu keep their final u and lose only j and/or n.
        not ujn_suffix
        // Require two vowels, a non-initial hyphen or a digit; test
        // restores the cursor to the end of the word afterwards.
        test long_word
        // Finally strip a regular ending.
        standard_suffix
    )
)