Here is a sample of Irish vocabulary, with the stemmed forms that will be generated by this algorithm:
Each block of words below is followed, after the ⇒ marker, by the corresponding stems in the same order:
ábharú ábhbar abhcach abhchóide abhcóid abhcóide abhcóideacht abhcóidí abhcóidíocht abhcóidíochta abheadh ábhéile abheimídne abhfad abhfos abhfus abhhaile abhí abhlach abhlainn abhlainne abhlainneach abhlaireacht abhlann abhlóir abhlóird abhlóirí abhlóra abhna abhóg |
⇒ |
ábharú ábhbar abhcach abhchóide abhcóid abhcóide abhcóid abhcóidí abhcóid abhcóid abh ábhéile abheimídne abhfad abhfos abhfus abhhaile abhí abhlach abhlainn abhlainne abhlainn abhlair abhlann abhlóir abhlóird abhlóirí abhlóra abhna abhóg |
pábháil pábhaile pábhailí pábhaillí pábháilte pábhála pábhálaithe pabhar pabhsae pabhsaeir pabhsaer pabhsaetha paca páca pacaeirí pacaí pacáil pacáilte pacáiltear pacaire pacaireachta pacáiste pácáiste pacaistí pacáistí pacáistín pacáistíocht pacáistíochta pacáistítear pacáistithe |
⇒ |
pábh pábhaile pábhailí pábhaillí pábháilte pábhála pábhálaithe pabhar pabhsae pabhsaeir pabhsaer pabhsaetha paca páca pacaeirí pacaí pac pacáilte pacáil pacaire pacair pacáiste pácáiste pacaistí pacáistí pacáistín pacáist pacáist pacáistí pacáistithe |
This basic stemmer for Irish was developed and contributed by Jim O’Regan.
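One thing that should be taken into account with Irish is initial mutation. When n-eclipsis or t-prothesis applies to a word that begins with a capital vowel, the prefixed letter stays in lowercase and no hyphen is written (nAthair, tAthair), whereas the all-lowercase spelling requires a hyphen (n-athair, t-athair). This causes problems if words are simply folded to lowercase before stemming in the way that is usually assumed by Snowball stemmers. A Snowball version of an algorithm to fold to lowercase while taking this into account would look something like: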
// Escape sequences inside string literals are written between { and }.
stringescapes {}

// Acute-accented vowels: each upper-case letter is followed by its
// lower-case counterpart.
stringdef A'   '{U+00C1}'  // A-acute
stringdef a'   '{U+00E1}'  // a-acute

stringdef E'   '{U+00C9}'  // E-acute
stringdef e'   '{U+00E9}'  // e-acute

stringdef I'   '{U+00CD}'  // I-acute
stringdef i'   '{U+00ED}'  // i-acute

stringdef O'   '{U+00D3}'  // O-acute
stringdef o'   '{U+00F3}'  // o-acute

stringdef U'   '{U+00DA}'  // U-acute
stringdef u'   '{U+00FA}'  // u-acute
// Rewrite a lower-case 'n' or 't' that is directly followed by an upper-case
// vowel as the hyphenated lower-case spelling (nA -> n-a, tA -> t-a, ...).
// Only the two characters matched at the cursor are replaced; the rest of the
// word is not altered by this routine. The routine signals f when none of the
// listed pairs is found at the cursor.
define tolower_irish as (
    [substring] among (

        // prefixed n + upper-case vowel (plain, then acute)
        'nA'       (<- 'n-a')
        'n{A'}'    (<- 'n-{a'}')
        'nE'       (<- 'n-e')
        'n{E'}'    (<- 'n-{e'}')
        'nI'       (<- 'n-i')
        'n{I'}'    (<- 'n-{i'}')
        'nO'       (<- 'n-o')
        'n{O'}'    (<- 'n-{o'}')
        'nU'       (<- 'n-u')
        'n{U'}'    (<- 'n-{u'}')

        // prefixed t + upper-case vowel (plain, then acute)
        'tA'       (<- 't-a')
        't{A'}'    (<- 't-{a'}')
        'tE'       (<- 't-e')
        't{E'}'    (<- 't-{e'}')
        'tI'       (<- 't-i')
        't{I'}'    (<- 't-{i'}')
        'tO'       (<- 't-o')
        't{O'}'    (<- 't-{o'}')
        'tU'       (<- 't-u')
        't{U'}'    (<- 't-{u'}')
    )
)
The following characters are vowels for the purposes of this algorithm:
a e i o u á é í ó ú
The algorithm first addresses the initial mutation, then regions are determined based on the word after this first step:
// Internal routines of the stemmer.
routines (
    initial_morph
    mark_regions
    RV R1 R2
    noun_sfx
    deriv
    verb_sfx
)

// The only routine exported to callers.
externals ( stem )

// Start positions of the RV, R1 and R2 regions; assigned in mark_regions.
integers ( pV p1 p2 )

// Character class used when scanning for regions.
groupings ( v )

// Escape sequences inside string literals are written between { and }.
stringescapes {}

/* Lower-case vowels carrying an acute accent */
stringdef a'   '{U+00E1}'  // a-acute
stringdef e'   '{U+00E9}'  // e-acute
stringdef i'   '{U+00ED}'  // i-acute
stringdef o'   '{U+00F3}'  // o-acute
stringdef u'   '{U+00FA}'  // u-acute

// Vowels: the five plain vowels and their acute-accented forms.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
// Compute the region markers by scanning forward from the cursor:
//   pV - just past the first vowel
//   p1 - just past the first non-vowel found after pV
//   p2 - just past the first non-vowel that follows the next vowel after p1
// The scan stops at the first step that fails; markers not reached by then
// keep their default value, limit.
define mark_regions as (

    // Defaults, in effect whenever the scan below does not get that far.
    $p2 = limit
    $p1 = limit
    $pV = limit

    do (
        gopast v        setmark pV
        gopast non-v    setmark p1
        gopast v
        gopast non-v    setmark p2
    )
)
// Undo the initial mutation, or strip a contracted particle, found at the
// cursor. The longest entry that matches there is selected and its action is
// applied; the routine signals f when no entry matches.
define initial_morph as (
    [substring] among (

        // Prefixes that are simply removed.
        'h-' 'n-' 't-'          // nAthair -> n-athair, but alone are problematic
        'd{'}'                  // verbs
        'm{'}' 'b{'}'           // other contractions
            (delete)

        // Mutated spellings reduced to the plain consonant. Each row holds
        // every spelling that maps to the same letter: spellings with a
        // prefixed letter first, then the consonant + h spelling.
        'mb'   'bh'                 (<- 'b')
        'gc'   'ch'                 (<- 'c')
        'nd'   'dh'                 (<- 'd')
        'bhf'  'd{'}fh'  'fh'       (<- 'f')    // d'fh: verbal d' before fh
        'ng'   'gh'                 (<- 'g')
        'mh'                        (<- 'm')
        'bp'   'ph'                 (<- 'p')
        'ts'   'sh'                 (<- 's')
        'dt'   'th'                 (<- 't')
    )
)
// Everything in this section runs right-to-left, from the end of the word.
backwardmode (

    // Region tests: each succeeds when the cursor is at or to the right of
    // the corresponding marker set by mark_regions.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // Noun endings. The longest listed ending at the end of the word is
    // selected; it is deleted only if the region test of its group succeeds.
    define noun_sfx as (
        [substring] among (
            'abh'           'eabh'
            'aibh'          'ibh'
            'amh'           'eamh'
            'aimh'          'imh'
            '{i'}ocht'      'a{i'}ocht'
            '{i'}ochta'     'a{i'}ochta'
                (R1 delete)

            'ire'           'aire'
            'ir{i'}'        'air{i'}'
                (R2 delete)
        )
    )

    // Derivational endings: either deleted (inside R2) or replaced by a
    // shorter fixed form.
    define deriv as (
        [substring] among (
            'ach'           'each'
            'acht'          'eacht'
            'achta'         'eachta'
            'acht{u'}il'    'eacht{u'}il'
                (R2 delete)     // siopadóireacht -> siopadóir but not poblacht -> pobl

            'arcacht'
            'arcachta'
            'arcachta{i'}'
                (<- 'arc')      // monarcacht -> monarc

            'gineach'
            'gineas'
            'ginis'
                (<- 'gin')

            'grafa{i'}och'
            'grafa{i'}ocht'
            'grafa{i'}ochta'
            'grafa{i'}ochta{i'}'
                (<- 'graf')

            'paite'
            'patach'
            'patacha'
            'pataigh'
                (<- 'paite')

            '{o'}ideach'
            '{o'}ideacha'
            '{o'}idigh'
                (<- '{o'}id')
        )
    )

    // Verb endings: the first group is deleted when inside RV, the second
    // group when inside R1.
    define verb_sfx as (
        [substring] among (
            'imid'      'aimid'
            '{i'}mid'   'a{i'}mid'
            'fidh'      'faidh'
                (RV delete)

            'adh'       'eadh'
            'ain'
            '{a'}il'
            'tar'       'tear'
                (R1 delete)
        )
    )
)
// Entry point of the stemmer. Every step is wrapped in `do`, so a step that
// signals f does not prevent the following steps from running.
define stem as (

    // Forward pass: remove the initial mutation first, so that the regions
    // are then measured on the word as it stands after that step.
    do initial_morph
    do mark_regions

    // Backward pass: suffix handling, working from the end of the word.
    backwards (
        do noun_sfx
        do deriv
        do verb_sfx
    )
)