Czech stemming algorithm

Links to resources

In March 2012, Jim O'Regan sent us an implementation of Ljiljana Dolamic's Czech stemmer, written in Snowball; its source is listed below.

// Routines defined in this file.  RV and R1 are region tests; the others
// are the region marker, the palatalise rewrite and the suffix steps.
routines (
  RV R1
  mark_regions
  palatalise
  do_case do_possessive
  do_comparative do_diminutive do_augmentative
  do_derivational do_deriv_single
  do_aggressive
)

// The one routine exported to callers.
externals ( stem )

// Marks assigned in mark_regions and read by the RV / R1 tests.
integers ( pV p1 )

// Character grouping for vowels.
groupings ( v )

// {name} inside a string literal is expanded as a string escape.
stringescapes {}

// Named escapes for the accented letters used in the suffix tables.
// The trailing mark in each name mirrors the diacritic:
// ' = acute, ^ = caron, * = ring above.
stringdef a' '{U+00E1}'  // á
stringdef c^ '{U+010D}'  // č
stringdef d^ '{U+010F}'  // ď
stringdef e' '{U+00E9}'  // é
stringdef e^ '{U+011B}'  // ě
stringdef i' '{U+00ED}'  // í
stringdef n^ '{U+0148}'  // ň
stringdef o' '{U+00F3}'  // ó
stringdef r^ '{U+0159}'  // ř
stringdef s^ '{U+0161}'  // š
stringdef t^ '{U+0165}'  // ť
stringdef u' '{U+00FA}'  // ú
stringdef u* '{U+016F}'  // ů
stringdef y' '{U+00FD}'  // ý
stringdef z^ '{U+017E}'  // ž

// Vowels: a e i o u y plus the accented forms á ě é í ó ú ů ý.
define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'

define mark_regions as (

    // Both marks start at the end of the word; a mark that the scan
    // below never reaches therefore stays there.
    $pV = limit
    $p1 = limit

    // `do` absorbs a failed scan (a mark already set is kept) and puts
    // the cursor back where it started.
    do (
        // pV: just past the first non-vowel.
        gopast non-v
        setmark pV

        // p1: continuing from pV, past the next non-vowel and then past
        // the next vowel.
        gopast non-v
        gopast v
        setmark p1
    )
)

backwardmode (

  // Region tests: each succeeds only when the cursor is at or after the
  // corresponding mark.
  define RV as ( $pV <= cursor )
  define R1 as ( $p1 <= cursor )

  define palatalise as (
    // Replace the longest listed ending found before the cursor with
    // 'k', 'h', 'ck' or 'sk'.  The RV test runs after the match, so the
    // matched ending must start at or after pV.  Signals failure when no
    // ending matches or the RV test fails.
    [substring] RV among (

      // NOTE(review): '{c^}' has no trailing vowel, unlike the other
      // entries in this table - confirm this is intended.
      'ci' 'ce'
      '{c^}i' '{c^}'
        ( <- 'k' )

      'zi' 'ze'
      '{z^}i' '{z^}e'
        ( <- 'h' )

      '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
        ( <- 'ck' )

      '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
        ( <- 'sk' )
    )
  )

  define do_possessive as (
    // Remove one of the listed endings, provided it starts at or after pV.
    // After 'in' is removed palatalise is attempted; `try` keeps this
    // routine successful when palatalise has nothing to rewrite.
    [substring] RV among (
      '{u*}v' 'ov'
        ( delete )
      'in'
        ( delete try palatalise )
    )
  )

  define do_case as (
    // Remove the longest listed case ending.  No RV/R1 test is applied.
    // Signals failure when none of the endings matches.
    [substring] among (

      // Endings that are simply removed.
      'atech'
      'at{u*}m' '{e^}tem'
      'ov{e'}' '{a'}ch' '{y'}ch' '{y'}mi'
      'ama' 'ami' 'ata' 'aty' 'ovi'
      '{a'}m' '{y'}m' 'at' 'mi' 'os' 'ou' 'us'
      '{a'}' '{e'}' '{u*}' '{y'}' 'a' 'o' 'u' 'y'
        ( delete )

      // Endings that are removed, after which palatalise is attempted.
      '{e'}ho' '{e'}mu' '{e^}mi' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
      '{i'}ch' 'ech' 'ich' 'emi' 'iho' 'imu'
      '{e'}m' '{i'}m' 'es'
      '{e^}' '{i'}' 'e' 'i'
        ( delete try palatalise )

      // 'em' is reduced to 'e' before palatalise is attempted.
      'em'
        ( <- 'e' try palatalise )
    )
  )

  // Handle the longest listed derivational suffix, provided the suffix
  // starts at or after p1 (the R1 test runs after the match).
  // Signals failure when nothing matches, when the R1 test fails, or when
  // a palatalise call below fails.
  define do_derivational as (
    [substring] R1 among (
      // Suffixes that are removed outright.
      'obinec'
      'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
      '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
      '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
      'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
      '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
      'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
      '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
      (delete)
      // The remaining groups replace the suffix by its leading vowel and
      // then call palatalise.  There is no `try`, so a failing palatalise
      // makes this routine fail even though the replacement has already
      // been made.
      'ion{a'}{r^}'
      'inec' 'itel'
      'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
      'ic' 'in' 'it' 'iv'
      (
        <- 'i'
        palatalise
      )
      'enic' 'ec' 'en'
      (
        <- 'e'
        palatalise
      )
      '{e'}{r^}'
      (
        <- '{e'}'
        palatalise
      )
      '{e^}n'
      (
        <- '{e^}'
        palatalise
      )
      '{i'}rn'
      '{i'}{r^}' '{i'}n'
      (
        <- '{i'}'
        palatalise
      )
    )
  )
  define do_deriv_single as (
    // Remove a single final letter if it is one of those listed.
    // No RV/R1 test is applied.
    [substring] among (
      'c' '{c^}' 'k'
      'l' 'n' 't'
        ( delete )
    )
  )


  define do_augmentative as (
    // Two of the endings are removed; the other two are reduced to 'i'
    // and must then be accepted by palatalise (no `try`, so a failing
    // palatalise makes this routine fail).  No RV/R1 test is applied.
    [substring] among (
      '{a'}k' 'ajzn'
        ( delete )
      'isk' 'izn'
        ( <- 'i' palatalise )
    )
  )

  // Handle the longest listed diminutive suffix.  No RV/R1 test is applied.
  // Signals failure when nothing matches or when a palatalise call below
  // fails.
  define do_diminutive as (
    [substring] among (
      // Suffixes that are removed outright.
      'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
      'anek' 'onek' 'unek' '{a'}nek'
      'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
      '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
      '{a'}tk' '{a'}nk' 'u{s^}k'
      'k'
      (delete)
      // Suffixes reduced to their leading vowel, after which palatalise
      // must succeed (there is no `try`).
      'e{c^}ek' 'enek' 'ek'
      (
        <- 'e'
        palatalise
      )
      '{e'}{c^}ek' '{e'}k'
      (
        <- '{e'}'
        palatalise
      )
      'i{c^}ek' 'inek' 'ik'
      (
        <- 'i'
        palatalise
      )
      '{i'}{c^}ek' '{i'}k'
      (
        <- '{i'}'
        palatalise
      )
      // Vowel + 'k': only the 'k' is dropped; palatalise is not called.
      '{a'}k'
       (<- '{a'}')
      'ak'
       (<- 'a')
      'ok'
       (<- 'o')
      'uk'
       (<- 'u')
    )
  )

  define do_comparative as (
    // Reduce the suffix to its leading vowel, then require palatalise to
    // succeed (no `try`).  No RV/R1 test is applied.
    [substring] among (
      'ej{s^}'
        ( <- 'e' palatalise )
      '{e^}j{s^}'
        ( <- '{e^}' palatalise )
    )
  )

  define do_aggressive as (
    // The first three steps are optional: `do` absorbs their failure and
    // restores the cursor afterwards.
    do do_comparative
    do do_diminutive
    do do_augmentative

    // Last step: do_deriv_single is tried only when do_derivational
    // fails.  If both fail, this routine signals failure.
    ( do_derivational or do_deriv_single )
  )
)

define stem as (
  // The region marks are computed first, before switching to backward
  // processing.
  do mark_regions
  backwards (
    // Every step is wrapped in `do`: a step that finds nothing to remove
    // signals failure, and without `do` that failure would skip all later
    // steps and make stem itself fail.  `do` also restores the cursor, so
    // each step starts matching from the end of the word.
    do do_case
    do do_possessive
    // light and aggressive are the same to this point
    // comment next line for light stemmer
    do do_aggressive
  )
)

// Ljiljana Dolamic and Jacques Savoy. 2009.
// Indexing and stemming approaches for the Czech language.
// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt