Irish Gaelic stemming algorithm

Links to resources

Here is a sample of Irish vocabulary, with the stemmed forms that will be generated by this algorithm:

word            stem            word            stem
ábharú          ábharú          pábháil         pábh
ábhbar          ábhbar          pábhaile        pábhaile
abhcach         abhcach         pábhailí        pábhailí
abhchóide       abhchóide       pábhaillí       pábhaillí
abhcóid         abhcóid         pábháilte       pábháilte
abhcóide        abhcóide        pábhála         pábhála
abhcóideacht    abhcóid         pábhálaithe     pábhálaithe
abhcóidí        abhcóidí        pabhar          pabhar
abhcóidíocht    abhcóid         pabhsae         pabhsae
abhcóidíochta   abhcóid         pabhsaeir       pabhsaeir
abheadh         abh             pabhsaer        pabhsaer
ábhéile         ábhéile         pabhsaetha      pabhsaetha
abheimídne      abheimídne      paca            paca
abhfad          abhfad          páca            páca
abhfos          abhfos          pacaeirí        pacaeirí
abhfus          abhfus          pacaí           pacaí
abhhaile        abhhaile        pacáil          pac
abhí            abhí            pacáilte        pacáilte
abhlach         abhlach         pacáiltear      pacáil
abhlainn        abhlainn        pacaire         pacaire
abhlainne       abhlainne       pacaireachta    pacair
abhlainneach    abhlainn        pacáiste        pacáiste
abhlaireacht    abhlair         pácáiste        pácáiste
abhlann         abhlann         pacaistí        pacaistí
abhlóir         abhlóir         pacáistí        pacáistí
abhlóird        abhlóird        pacáistín       pacáistín
abhlóirí        abhlóirí        pacáistíocht    pacáist
abhlóra         abhlóra         pacáistíochta   pacáist
abhna           abhna           pacáistítear    pacáistí
abhóg           abhóg           pacáistithe     pacáistithe

This basic stemmer for Irish was developed and contributed by Jim O’Regan.

One thing that should be taken into account with Irish is initial mutation: n-eclipsis and t-prothesis place a lowercase prefix letter directly before a capitalised vowel (e.g. nAthair), so simply folding words to lowercase before stemming, in the way that is usually assumed by Snowball stemmers, loses the boundary between prefix and word (nAthair should become n-athair, not nathair). A Snowball version of an algorithm to fold to lowercase while taking this into account would look something like:

stringescapes {}

// Acute-accented vowels: upper-case forms first, then lower-case forms.
stringdef A'   '{U+00C1}'  // A-acute
stringdef E'   '{U+00C9}'  // E-acute
stringdef I'   '{U+00CD}'  // I-acute
stringdef O'   '{U+00D3}'  // O-acute
stringdef U'   '{U+00DA}'  // U-acute
stringdef a'   '{U+00E1}'  // a-acute
stringdef e'   '{U+00E9}'  // e-acute
stringdef i'   '{U+00ED}'  // i-acute
stringdef o'   '{U+00F3}'  // o-acute
stringdef u'   '{U+00FA}'  // u-acute

// Rewrites a lowercase n or t followed by an upper-case vowel (at the
// cursor) as prefix, hyphen, lower-case vowel.  Only these two characters
// are rewritten here; the routine fails if none of the pairs is present.
define tolower_irish as (
  [substring] among (
    // n before an upper-case vowel (plain | accented)
    'nA' (<- 'n-a')    'n{A'}' (<- 'n-{a'}')
    'nE' (<- 'n-e')    'n{E'}' (<- 'n-{e'}')
    'nI' (<- 'n-i')    'n{I'}' (<- 'n-{i'}')
    'nO' (<- 'n-o')    'n{O'}' (<- 'n-{o'}')
    'nU' (<- 'n-u')    'n{U'}' (<- 'n-{u'}')

    // t before an upper-case vowel (plain | accented)
    'tA' (<- 't-a')    't{A'}' (<- 't-{a'}')
    'tE' (<- 't-e')    't{E'}' (<- 't-{e'}')
    'tI' (<- 't-i')    't{I'}' (<- 't-{i'}')
    'tO' (<- 't-o')    't{O'}' (<- 't-{o'}')
    'tU' (<- 't-u')    't{U'}' (<- 't-{u'}')
  )
)

The following characters are vowels for the purposes of this algorithm:

a e i o u á é í ó ú

The algorithm first addresses the initial mutation, then regions are determined based on the word after this first step:

  • RV is the region after the first vowel, or the end of the word if it contains no vowels.
  • R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
  • R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel.

The full algorithm in Snowball

// Routines defined below.  R1, R2 and RV are the region tests used by the
// suffix routines; the rest are the individual steps called from stem.
routines (
  R1 R2 RV
  initial_morph
  mark_regions
  noun_sfx
  deriv
  verb_sfx
)

// The only routine exported to callers.
externals ( stem )

// Positions of the starts of RV, R1 and R2; assigned in mark_regions.
integers ( pV p1 p2 )

// v is the vowel grouping, defined after the accented-letter stringdefs.
groupings ( v )

// { and } delimit escapes inside string literals from here on.
stringescapes {}

/* Accented characters */

stringdef a'   '{U+00E1}'  // a-acute
stringdef e'   '{U+00E9}'  // e-acute
stringdef i'   '{U+00ED}'  // i-acute
stringdef o'   '{U+00F3}'  // o-acute
stringdef u'   '{U+00FA}'  // u-acute

// Vowels: the five plain vowels plus their acute-accented forms.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}'

// Records where RV, R1 and R2 begin, scanning forward from the cursor.
define mark_regions as (

    // Every mark starts at the end of the word, i.e. an empty region.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // If any step fails the scan stops there and the marks not yet set
    // keep their defaults.
    do (
        gopast v          // just past the first vowel
        setmark pV
        gopast non-v      // just past the first non-vowel after it
        setmark p1
        gopast v          // same vowel / non-vowel step again, from p1
        gopast non-v
        setmark p2
    )
)

// Removes or undoes a word-initial mutation: the longest pattern below that
// matches at the cursor is either deleted or replaced by a single consonant.
define initial_morph as (
  [substring] among (

    // Hyphenated prefixes (nAthair -> n-athair once lowercased).
    // Note: these prefixes standing alone are problematic.
    'h-' 'n-' 't-'      (delete)

    // Verbs
    'd{'}'              (delete)
    'd{'}fh'            (<- 'f')

    // Other contractions
    'm{'}' 'b{'}'       (delete)

    'sh'                (<- 's')

    // Prefixed consonant clusters: only the final letter is kept
    'mb'                (<- 'b')
    'gc'                (<- 'c')
    'nd'                (<- 'd')
    'bhf'               (<- 'f')
    'ng'                (<- 'g')
    'bp'                (<- 'p')
    'ts'                (<- 's')
    'dt'                (<- 't')

    // Lenition: the h after the initial consonant is dropped
    'bh'                (<- 'b')
    'ch'                (<- 'c')
    'dh'                (<- 'd')
    'fh'                (<- 'f')
    'gh'                (<- 'g')
    'mh'                (<- 'm')
    'ph'                (<- 'p')
    'th'                (<- 't')
  )
)

backwardmode (

  // Region tests: each succeeds when the cursor is at or after the
  // corresponding mark set by mark_regions.
  define R2 as $p2 <= cursor
  define R1 as $p1 <= cursor
  define RV as $pV <= cursor

  // Noun endings: the longest matching ending is deleted if it lies in the
  // required region.
  define noun_sfx as (
    [substring] among (
      'abh' 'eabh' 'aibh' 'ibh'
      'amh' 'eamh' 'aimh' 'imh'
      '{i'}ocht' '{i'}ochta' 'a{i'}ocht' 'a{i'}ochta'
        (R1 delete)

      'ire' 'aire' 'ir{i'}' 'air{i'}'
        (R2 delete)
    )
  )

  // Derivational endings: either deleted (inside R2) or cut back to a
  // fixed shorter ending.
  define deriv as (
    [substring] among (
      // siopadóireacht -> siopadóir but not poblacht -> pobl
      'ach' 'each'
      'acht' 'eacht'
      'achta' 'eachta'
      'acht{u'}il' 'eacht{u'}il'
        (R2 delete)

      // monarcacht -> monarc
      'arcacht' 'arcachta' 'arcachta{i'}'
        (<- 'arc')

      'gineach' 'gineas' 'ginis'
        (<- 'gin')

      'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
        (<- 'graf')

      'paite' 'patach' 'patacha' 'pataigh'
        (<- 'paite')

      '{o'}ideach' '{o'}ideacha' '{o'}idigh'
        (<- '{o'}id')
    )
  )

  // Verb endings: the first group must lie in RV, the second in R1.
  define verb_sfx as (
    [substring] among (
      'imid' '{i'}mid' 'aimid' 'a{i'}mid'
      'fidh' 'faidh'
        (RV delete)

      'adh' 'eadh'
      'tar' 'tear'
      'ain'
      '{a'}il'
        (R1 delete)
    )
  )
)

// Entry point: stems the current word in place.
define stem as (
  // Forward pass.  The initial mutation is handled first so that the
  // regions are measured on the word as it stands after that step.
  do initial_morph
  do mark_regions
  // Backward pass.  Each suffix routine is tried once, in this order; `do`
  // lets the following steps run even when a routine removes nothing.
  backwards (
    do noun_sfx
    do deriv
    do verb_sfx
  )
)