Here is a sample of Estonian vocabulary, with the stemmed forms that will be generated by this algorithm:
Each row of words below is followed, after ⇒, by the row of their stems in the same order:
raamat raamatu raamatut raamatule raamatud raamatute raamatuid raamatutele raamatutestki hele heleda heledat heledale heledad heledate heledaid heledatele heledam heledama heledamat heledamad heledamate heledamaid heledamatelegi heledaim heledaima heledaimat heledaimale heledaimad heledaimate heledaimaid heledaimatelt hobune hobuse hobust hobusele hobused hobuste hobuseid hobustele |
⇒ |
raama raama raama raama raama raama raama raama raama hele hele heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda heleda hobune hobuse hobu hobuse hobuse hobus hobuse hobus |
hüpata hüppa hüppaksin hüppaksid hüppaks hüppaksime hüppaksite hüppan hüppad hüppab hüppame hüppate hüppavad hüppasin hüppasid hüppas hüppasime hüppasite hüpanuksite hüpatakse hüpati hüpanud hüpanutest hüpates hüppavat hüppavatele hüppamata hüppamast hüljes hülge hüljest hülgesse hüljeste hülgeid hüljestesse hülgeisse ohutule ohutud ohutuid ohututele |
⇒ |
hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpa hüpati hüpa hüpa hüpa hüpa hüpa hüpa hüpa hülje hülge hülje hülge hüljes hülge hüljes hülge ohu ohu ohu ohu |
This algorithm is written in collaboration with Estonian text analytics enterprise Texta.
Letters in Estonian include the following accented forms: ä, ö, õ, ü, š, ž.
The following letters are vowels (V1): a, e, i, o, u, õ, ä, ö, ü.
RV is defined as one of the following: a, e, i, u, o.
KI is defined as one of the following (letters possible before -ki emphasis): k, p, t, g, b, d, s, h, f, š, z, ž.
GI is defined as one of the following (letters possible before -gi emphasis): c, j, l, m, n, q, r, v, w, x, a, e, i, o, u, õ, ä, ö, ü.
R1 in this algorithm is defined as a region after the first consonant preceded by a vowel (laul[nud], mõt[teid], kar[tuleid], saab[as]). If there’s no such region, then R1 is empty (laul[Ø], saun[Ø]). Limitations in steps (such as "if preceded by RV") are not restricted to the R1 region.
LONGV is defined as one of the following: aa, ee, ii, oo, uu, ää, öö, üü, õõ.
Do step 0. If nothing was changed in step 0, continue with the steps, otherwise stop. Do step 1 and step 2. If nothing was changed in step 2, do steps 3, 4, 5, 6, 7, 8 and 9. If something was changed in step 2, do step 9.
Step 0: verb_exceptions
Step 1: emphasis
Step 2: verb
Step 3: special_noun_endings
Step 4: case_ending
Step 5: plural_three_first_cases
Step 6: degrees
Step 7: i_plural
Step 8: nu
Step 9: undouble_kpt
/* Estonian stemmer version 1.3
Made by Linda Freienthal in January 2019.
*/
// Declaration of every routine defined in this file. mark_regions and
// verb_exceptions are defined in forward mode; the remaining routines are
// defined inside the backwardmode ( ... ) section.
routines (
mark_regions
LONGV
special_noun_endings
case_ending
emphasis
plural_three_first_cases
undouble_kpt
i_plural
degrees
substantive
verb_exceptions
verb
nu
)
// Use { and } as the escape delimiters inside string literals, then give
// short names to the non-ASCII Estonian letters so they can be written as
// {a"}, {o"}, {o~}, {u"}, {sv} and {zv} in the strings below.
stringescapes {}
stringdef a" '{U+00E4}' //a-umlaut ä
stringdef o" '{U+00F6}' //o-umlaut ö
stringdef o~ '{U+00F5}' //o with tilde õ
stringdef u" '{U+00FC}' //u-umlaut ü
stringdef sv '{U+0161}' //s-caron š
stringdef zv '{U+017E}' //z-caron ž
// stem is the only routine exported from this file.
externals ( stem )
// p1 marks the start of region R1; it is set by mark_regions.
integers ( p1 )
groupings ( V1 RV KI GI)
// V1: all vowels, including the accented ones.
define V1 'aeiou{o~}{a"}{o"}{u"}'
// RV: the unaccented vowels only.
define RV 'aeiuo'
// KI: letters that may precede the -ki emphasis suffix.
define KI 'kptgbdshf{sv}z{zv}'
// GI: letters that may precede the -gi emphasis suffix.
define GI 'cjlmnqrvwxaeiou{o~}{a"}{o"}{u"}'
define mark_regions as (
    // Start with R1 empty: p1 stays at the end of the word unless the scan
    // below finds a vowel that is followed by a non-vowel.
    $p1 = limit

    // Go to the first vowel, step past the first non-vowel after it, and
    // record that position as the start of R1.
    goto V1
    gopast non-V1
    setmark p1
)
backwardmode (
define emphasis as (
    // Find a -gi / -ki emphasis suffix lying inside R1.
    setlimit tomark p1 for ( [substring] )

    // At least four characters must precede the suffix, so a short word
    // such as "kingi" is left unchanged.
    test hop 4

    among (
        // -ki is removed when preceded by a KI letter: kookki -> kook
        'ki' ( KI delete )

        // -gi is removed when preceded by a GI letter, but not when a long
        // vowel precedes it: jookse-me-gi loses -gi, bioloogi is kept.
        'gi' ( (GI and not LONGV) delete )
    )
)
// Signals t if a replacement was made; f otherwise.
define verb as (
    // The verb ending has to lie inside R1.
    setlimit tomark p1 for ( [substring] )
    among (
        // Endings that are removed unconditionally.
        'nuksin' 'nuksime' 'nuksid' 'nuksite' // seleta-nuksite
        'ksin' 'ksime' 'ksid' 'ksite'          // personal conditional: rõõmusta-ksin
        'mata'
        'takse' 'dakse'                        // impersonal: laul-dakse, luba-takse
        'taks' 'daks'                          // impersonal conditional: laul-daks, saade-taks
        'sime'                                 // pl1pst: saat-sime
        'site'                                 // pl2pst: saat-site
        'sin'                                  // sg1pst: laul-sin, saat-sin
            ( delete )

        // Impersonal -akse keeps its 'a': tulla-kse, süüa-kse (-> söö),
        // teha-kse (-> tegi), püüta-kse, leita-kse
        'akse' ( <- 'a' )

        // Endings that are removed only directly after a vowel.
        'me'                                   // pl1prs: laula-me, tule-me
        'da'                                   // da-infinitive: luba-da
        'n'                                    // sg1prs: kirjuta-n
        'b'                                    // sg3prs: laula-b
            ( V1 delete )
    )
)
// Succeeds when the text ending at the cursor is a doubled (long) vowel.
define LONGV as (
    among (
        'aa' 'ee' 'ii' 'oo' 'uu'
        '{a"}{a"}' '{o"}{o"}' '{u"}{u"}' '{o~}{o~}'
    )
)
define i_plural as (
    // The plural marker must lie inside R1.
    setlimit tomark p1 for ( [substring] )
    among (
        // Drop -i when an RV vowel stands before it: raama-tu-i, lapsiku-i
        'i' ( RV delete )
    )
)
define special_noun_endings as (
    // Normalise the inflected forms of the -lane, -mine and -line endings
    // (found inside R1) to their -lase, -mise and -lise forms.
    setlimit tomark p1 for ( [substring] )
    among (
        // teadlasse, teadlast, teadlane, teadlasi -> teadlase
        'lasse' 'last' 'lane' 'lasi' ( <- 'lase' )

        // tegemisse, kasutamist, tegemine, kasutamisi -> tegemise / kasutamise
        'misse' 'mist' 'mine' 'misi' ( <- 'mise' )

        // rohelisse, tavalist, roheline, tavalisi -> rohelise / tavalise
        'lisse' 'list' 'line' 'lisi' ( <- 'lise' )
    )
)
define case_ending as (
    // The case ending must lie inside R1.
    setlimit tomark p1 for ( [substring] )
    among (
        // Endings removed when preceded by an RV vowel or a long vowel.
        'sse'   // illative: saapa-sse
        'st'    // elative: saapa-st and kapsas-t
        'le'    // allative: raama-tu-le
        'lt'    // ablative: raama-tu-lt
        'ga'    // comitative: õpetaja-ga
        'ks'    // translative: õpetaja-ks
        'ta'    // abessive and da-infinitive: õpetaja-ta and hüpa-ta
        's'     // inessive and sg3pst: raama-tu-s and sõiti-s
        'l'     // adessive: raama-tu-l and kapsa-l
            ( (RV or LONGV) delete )

        // Partitive -t (raamatu-t) is removed only when at least four
        // characters precede it.
        't' ( (test hop 4) delete )
    )
)
define plural_three_first_cases as (
    // The plural ending must lie inside R1.
    setlimit tomark p1 for ( [substring] )
    among (
        // Plural forms of -ik words go back to -iku:
        // õnnelikkude -> õnneliku, rahulikke -> rahuliku, ohtlike -> ohtliku
        'ikkude' 'ikke' 'ike' ( <- 'iku' )

        // Plural partitive, sg2pst and pl3pst: auto-sid, laul-sid.
        // Not removed after a long vowel, which excludes plural nominatives
        // such as gaasid, roosid.
        'sid' ( not LONGV delete )

        // Plural genitive and pl2: ministri-te, saada-te, laula-te.
        'te' (
            (
                // With at least four characters in front of -te:
                test hop 4
                among (
                    // roheliste -> rohelise, tegemiste -> tegemise,
                    // teadlaste -> teadlase
                    'mis' 'las' 'lis' ( <- 'e' )
                    // -tte is left untouched here (kokkuvõtte); the doubled
                    // t is handled by undouble_kpt.
                    't' ()
                    // anything else: drop -te
                    '' ( delete )
                )
            )
            // Shorter words keep the t: torte -> tort
            or <- 't'
        )

        // Plural genitive lauda-de and plural nominative voodi-d,
        // rattai-d (rata), lapsiku-i-d: removed after an RV or long vowel.
        'de' 'd' ( (RV or LONGV) delete )
    )
)
define nu as (
    // Remove a participle marker found inside R1:
    // haka-nu(-te-ga), luba-tu(-d), laul-du(-te-st), laul-va(-te-le)
    setlimit tomark p1 for ( [substring] )
    among (
        'nu' 'tu' 'du' 'va' ( delete )
    )
)
define undouble_kpt as (
    // Reduce a final '-C1C1V' to '-C1V' where C1 is k, p or t:
    //   mõtte(-le) -> mõte, hakka(-n) -> haka
    //
    // The final vowel has to be inside R1. This avoids touching short
    // non-words, mostly acronyms/initialisms such as "PPE".
    V1
    $(p1 <= cursor)
    [substring]
    among (
        'kk' ( <- 'k' )
        'pp' ( <- 'p' )
        'tt' ( <- 't' )
    )
)
define degrees as (
    // The comparison ending must lie inside R1.
    setlimit tomark p1 for ( [substring] )
    among (
        // Removed only after an RV vowel:
        // heleda-mai(-le), kauge-i-m, rõõmsa-m
        'mai' 'm' ( RV delete )

        // Always removed: tuge-va-ma(-le) and ma-infinitive sõit-ma
        'ma' ( delete )
    )
)
define substantive as (
    // Run the noun/adjective steps in order. Each one is wrapped in `do`,
    // so a step that does not apply never stops the following ones.
    do special_noun_endings         // -lane / -mine / -line normalisation
    do case_ending                  // case suffixes
    do plural_three_first_cases     // plural nominative/genitive/partitive
    do degrees                      // comparative and superlative
    do i_plural                     // i-plural marker
    do nu                           // participle markers
)
)
// Whole-word table of irregular verb forms. When the complete input is one
// of the listed forms it is replaced by the string given for its group;
// otherwise the routine signals f and the input is left unchanged.
define verb_exceptions as (
// Bracket the longest listed form starting at the cursor and require that the
// match reaches the end of the input, so only complete words are rewritten.
[substring] atlimit
among(
// forms rewritten to 'joo'
'joon' 'jood' 'joob' 'joote' 'joome' 'joovad' (<-'joo')
'j{o~}in' 'j{o~}id' 'j{o~}i' 'j{o~}ime' 'j{o~}ite' (<-'joo')
'joomata' 'juuakse' 'joodakse' 'juua' 'jooma' (<- 'joo')
// forms rewritten to 'saa'
'saan' 'saad' 'saab' 'saate' 'saame' 'saavad' (<-'saa')
'saaksin' 'saaksid' 'saaks' 'saaksite' 'saaksime' (<-'saa')
'sain' 'said' 'sai' 'saite' 'saime' (<-'saa')
'saamata' 'saadakse' 'saadi' 'saama' 'saada' (<-'saa')
// forms rewritten to 'viima'
'viin' 'viid' 'viib' 'viite' 'viime' 'viivad' (<-'viima')
'viiksin' 'viiksid' 'viiks' 'viiksite' 'viiksime' (<-'viima')
'viisin' 'viisite' 'viisime' (<-'viima')
'viimata' 'viiakse' 'viidi' 'viima' 'viia' (<-'viima')
// forms rewritten to 'keesi'
'keen' 'keeb' 'keed' 'kees' 'keeme' 'keete' 'keevad' (<-'keesi')
'keeksin' 'keeks' 'keeksid' 'keeksime' 'keeksite' (<-'keesi')
'keemata' 'keema' 'keeta' 'keedakse' (<-'keesi')
// forms rewritten to 'löö'
'l{o"}{o"}n' 'l{o"}{o"}d' 'l{o"}{o"}b' 'l{o"}{o"}me' 'l{o"}{o"}te' 'l{o"}{o"}vad' (<-'l{o"}{o"}')
'l{o"}{o"}ksin' 'l{o"}{o"}ksid' 'l{o"}{o"}ks' 'l{o"}{o"}ksime' 'l{o"}{o"}ksite' (<-'l{o"}{o"}')
'l{o"}{o"}mata' 'l{u"}{u"}akse' 'l{o"}{o"}dakse' 'l{o"}{o"}di' 'l{o"}{o"}ma' 'l{u"}{u"}a' (<-'l{o"}{o"}')
// Both looma and lööma have these same past tense forms
'l{o~}in' 'l{o~}id' 'l{o~}i' 'l{o~}ime' 'l{o~}ite' (<-'l{o~}i')
// forms rewritten to 'loo'
'loon' 'lood' 'loob' 'loome' 'loote' 'loovad' (<-'loo')
'looksin' 'looksid' 'looks' 'looksime' 'looksite' (<-'loo')
'loomata' 'luuakse' 'loodi' 'luua' 'looma' (<-'loo')
// forms rewritten to 'käisi'
'k{a"}in' 'k{a"}ib' 'k{a"}id' 'k{a"}is' 'k{a"}ime' 'k{a"}ite' 'k{a"}ivad' (<-'k{a"}isi')
'k{a"}iksin' 'k{a"}iks' 'k{a"}iksid' 'k{a"}iksime' 'k{a"}iksite' (<-'k{a"}isi')
'k{a"}imata' 'k{a"}iakse' 'k{a"}idi' 'k{a"}ia' 'k{a"}ima' (<-'k{a"}isi')
// forms rewritten to 'söö'
's{o"}{o"}n' 's{o"}{o"}b' 's{o"}{o"}d' 's{o"}{o"}me' 's{o"}{o"}te' 's{o"}{o"}vad' (<-'s{o"}{o"}')
's{o"}{o"}ksin' 's{o"}{o"}ks' 's{o"}{o"}ksid' 's{o"}{o"}ksime' 's{o"}{o"}ksite' (<-'s{o"}{o"}')
's{o~}in' 's{o~}i' 's{o~}id' 's{o~}ime' 's{o~}ite' (<-'s{o"}{o"}')
's{o"}{o"}mata' 's{u"}{u"}akse' 's{o"}{o"}dakse' 's{o"}{o"}di' 's{o"}{o"}ma' 's{u"}{u"}a' (<-'s{o"}{o"}')
// forms rewritten to 'too'
'toon' 'tood' 'toob' 'toote' 'toome' 'toovad' (<-'too')
'tooksin' 'tooksid' 'tooks' 'tooksite' 'tooksime' (<-'too')
't{o~}in' 't{o~}id' 't{o~}i' 't{o~}ime' 't{o~}ite' (<-'too')
'toomata' 'tuuakse' 'toodi' 'tooma' 'tuua' (<-'too')
// forms rewritten to 'võisi'
'v{o~}in' 'v{o~}id' 'v{o~}ib' 'v{o~}ime' 'v{o~}is' 'v{o~}ite' 'v{o~}ivad' (<-'v{o~}isi')
'v{o~}iksin' 'v{o~}iksid' 'v{o~}iks' 'v{o~}iksime' 'v{o~}iksite' (<-'v{o~}isi')
'v{o~}imata' 'v{o~}idakse' 'v{o~}idi' 'v{o~}ida' 'v{o~}ima' (<-'v{o~}isi')
// forms rewritten to 'jääma'
'j{a"}{a"}n' 'j{a"}{a"}d' 'j{a"}{a"}b' 'j{a"}{a"}me' 'j{a"}{a"}te' 'j{a"}{a"}vad' (<-'j{a"}{a"}ma')
'j{a"}{a"}ksin' 'j{a"}{a"}ksid' 'j{a"}{a"}ks' 'j{a"}{a"}ksime' 'j{a"}{a"}ksite' (<-'j{a"}{a"}ma')
'j{a"}ime' 'j{a"}ite' 'j{a"}in' 'j{a"}id' 'j{a"}i' (<-'j{a"}{a"}ma')
'j{a"}{a"}mata' 'j{a"}{a"}dakse' 'j{a"}{a"}da' 'j{a"}{a"}ma' 'j{a"}{a"}di' (<-'j{a"}{a"}ma')
// forms rewritten to 'müüsi'
'm{u"}{u"}n' 'm{u"}{u"}d' 'm{u"}{u"}b' 'm{u"}{u"}s' 'm{u"}{u"}me' 'm{u"}{u"}te' 'm{u"}{u"}vad' (<-'m{u"}{u"}si')
'm{u"}{u"}ksin' 'm{u"}{u"}ksid' 'm{u"}{u"}ks' 'm{u"}{u"}ksime' 'm{u"}{u"}ksite' (<-'m{u"}{u"}si')
'm{u"}{u"}mata' 'm{u"}{u"}akse' 'm{u"}{u"}di' 'm{u"}{u"}a' 'm{u"}{u"}ma' (<-'m{u"}{u"}si')
// forms rewritten to 'luge'
'loeb' 'loen' 'loed' 'loeme' 'loete' 'loevad' (<- 'luge')
'loeks' 'loeksin' 'loeksid' 'loeksime' 'loeksite' (<- 'luge')
// forms rewritten to 'põde'
'p{o~}en' 'p{o~}eb' 'p{o~}ed' 'p{o~}eme' 'p{o~}ete' 'p{o~}evad' (<- 'p{o~}de')
'p{o~}eksin' 'p{o~}eks' 'p{o~}eksid' 'p{o~}eksime' 'p{o~}eksite' (<- 'p{o~}de')
// forms rewritten to 'ladu'
'laon' 'laob' 'laod' 'laome' 'laote' 'laovad' (<- 'ladu')
'laoksin' 'laoks' 'laoksid' 'laoksime' 'laoksite' (<- 'ladu')
// forms rewritten to 'tegi'
'teeksin' 'teeks' 'teeksid' 'teeksime' 'teeksite' (<- 'tegi')
'teen' 'teeb' 'teed' 'teeme' 'teete' 'teevad' (<- 'tegi')
'tegemata' 'tehakse' 'tehti' 'tegema' 'teha' (<-'tegi')
// forms rewritten to 'nägi'
'n{a"}en' 'n{a"}eb' 'n{a"}ed' 'n{a"}eme' 'n{a"}ete' 'n{a"}evad' (<-'n{a"}gi')
'n{a"}eksin' 'n{a"}eks' 'n{a"}eksid' 'n{a"}eksime' 'n{a"}eksite' (<-'n{a"}gi')
'n{a"}gemata' 'n{a"}hakse' 'n{a"}hti' 'n{a"}ha' 'n{a"}gema' (<-'n{a"}gi')
)
)
define stem as (
    // Step 0: if the whole word is a listed irregular verb form,
    // verb_exceptions has already rewritten it and nothing else is done.
    not verb_exceptions

    // p1 isn't used by verb_exceptions, so R1 is only computed from here on.
    do mark_regions

    backwards (
        // Step 1: -gi / -ki emphasis.
        do emphasis

        // Step 2, then steps 3-8 only when no verb ending was replaced.
        do ( verb or substantive )

        // Step 9: undouble a final kk / pp / tt before a vowel.
        do undouble_kpt
    )
)