Czech stemming algorithm

In March 2012 Jim O’Regan sent us an implementation of Ljiljana Dolamic's Czech stemmer.

// Routines defined by this stemmer.
routines (
RV R1
palatalise
mark_regions
do_possessive
do_case
do_comparative
do_diminutive
do_augmentative
do_derivational
do_deriv_single
do_aggressive
)

// stem is the routine exposed to users of the generated stemmer.
externals ( stem )

// Region marks: assigned in mark_regions, tested by RV and R1.
integers ( pV p1 )

// v is the vowel grouping.
groupings ( v )

// { and } delimit escape sequences inside string literals.
stringescapes {}

// Named escapes for the accented letters used in the suffix tables:
// x' = acute, x^ = caron, u* = u with ring.
stringdef a' '{U+00E1}'
stringdef c^ '{U+010D}'
stringdef d^ '{U+010F}'
stringdef e' '{U+00E9}'
stringdef e^ '{U+011B}'
stringdef i' '{U+00ED}'
stringdef n^ '{U+0148}'
stringdef o' '{U+00F3}'
stringdef r^ '{U+0159}'
stringdef s^ '{U+0161}'
stringdef t^ '{U+0165}'
stringdef u' '{U+00FA}'
stringdef u* '{U+016F}'
stringdef y' '{U+00FD}'
stringdef z^ '{U+017E}'

// Vowels: a e i o u y plus the accented vowels declared above.
define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'

// Sets the region marks pV and p1 (forward scan from the start of the word).
define mark_regions as (

    // Both marks start at the end of the word; each keeps that value
    // unless the corresponding setmark below is reached.
    \$pV = limit
    \$p1 = limit

    do (
        // pV: the position just after the first non-vowel.
        gopast non-v
        setmark pV

        // p1: from pV, go past the next non-vowel and then past the next
        // vowel; the position reached is p1.
        gopast non-v
        gopast v
        setmark p1
    )
)

backwardmode (

// RV succeeds when the cursor is at or to the right of the pV mark.
define RV as (
    \$pV <= cursor
)

// R1 succeeds when the cursor is at or to the right of the p1 mark.
define R1 as (
    \$p1 <= cursor
)

// Replaces one of the listed endings found immediately before the cursor
// with 'k', 'h', 'ck' or 'sk'.  The left edge of the match must satisfy RV;
// the routine fails when nothing matches or RV fails.
//
// NOTE(review): the first group lists a bare '{c^}' where the 'z' group pairs
// '{z^}i' with '{z^}e'; '{c^}e' may have been intended.  Callers that delete
// the whole suffix before calling (see do_case, do_possessive) can currently
// only hit this bare entry, so confirm against the reference implementation
// before changing it.
define palatalise as (
    [substring] RV among (

        'ci' 'ce' '{c^}i' '{c^}'
            ( <- 'k' )

        'zi' 'ze' '{z^}i' '{z^}e'
            ( <- 'h' )

        '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
            ( <- 'ck' )

        '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
            ( <- 'sk' )
    )
)

// Removes a possessive suffix whose left edge satisfies RV.
// After removing 'in', palatalise is attempted on what remains
// (its failure is ignored).
define do_possessive as (
    [substring] RV among (

        'ov' '{u*}v'
            ( delete )

        'in'
            ( delete  try palatalise )
    )
)

// Removes a case ending from the end of the word.  No region test is
// applied.  Fails when no listed ending matches.
define do_case as (
    [substring] among (

        // Endings that are simply removed.
        'atech'
        '{e^}tem' 'at{u*}m'
        '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
        'ata' 'aty' 'ama' 'ami' 'ovi'
        'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
        'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
            ( delete )

        // Endings that are removed and then followed by an optional
        // palatalise on what remains.
        'ech' 'ich' '{i'}ch'
        '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
        'emi' 'iho' 'imu'
        '{e'}m' '{i'}m' 'es'
        'e' 'i' '{i'}' '{e^}'
            ( delete  try palatalise )

        // 'em' is reduced to 'e' before the optional palatalise.
        // NOTE(review): after '<-' in backward mode the cursor appears to
        // stay at the left edge of the replaced slice, so palatalise would
        // examine the text before the retained 'e' -- confirm intended.
        'em'
            ( <- 'e'  try palatalise )
    )
)

// Removes or shortens a derivational suffix whose left edge satisfies R1.
// Suffixes in the first group are deleted; the others are reduced to their
// leading vowel and palatalise must then succeed, otherwise the routine
// signals failure (the replacement has already been made by then).
//
// NOTE(review): after '<-' in backward mode the cursor appears to stay at the
// left edge of the replaced slice, so palatalise would examine the text
// before the retained vowel -- confirm intended.
define do_derivational as (
    [substring] R1 among (

        'obinec'
        'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
        '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
        '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
        'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
        '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
        'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
        '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
            ( delete )

        // Reduced to 'i'.
        'ion{a'}{r^}'
        'inec' 'itel'
        'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
        'ic' 'in' 'it' 'iv'
            ( <- 'i'  palatalise )

        // Reduced to 'e'.
        'enic' 'ec' 'en'
            ( <- 'e'  palatalise )

        // Reduced to e-acute.
        '{e'}{r^}'
            ( <- '{e'}'  palatalise )

        // Reduced to e-caron.
        '{e^}n'
            ( <- '{e^}'  palatalise )

        // Reduced to i-acute.
        '{i'}rn'
        '{i'}{r^}' '{i'}n'
            ( <- '{i'}'  palatalise )
    )
)
// Removes a single final letter from the listed set.  No region test is
// applied.  Fails when the word does not end in one of these letters.
define do_deriv_single as (
    [substring] among (
        'c' '{c^}' 'k' 'l' 'n' 't'
            ( delete )
    )
)

// Removes or shortens an augmentative suffix.  No region test is applied.
// 'ajzn' and a-acute + 'k' are deleted; 'izn' and 'isk' are reduced to 'i',
// after which palatalise must succeed for the routine to succeed.
//
// NOTE(review): after '<-' in backward mode the cursor appears to stay at the
// left edge of the replaced slice, so palatalise would examine the text
// before the retained 'i' -- confirm intended.
define do_augmentative as (
    [substring] among (

        'ajzn' '{a'}k'
            ( delete )

        'izn' 'isk'
            ( <- 'i'  palatalise )
    )
)

// Removes or shortens a diminutive suffix.  No region test is applied.
// The first group is deleted outright.  Suffixes beginning with e, e-acute,
// i or i-acute are reduced to that vowel and palatalise must then succeed.
// Suffixes a-acute/a/o/u + 'k' are reduced to the vowel with no further step.
//
// NOTE(review): after '<-' in backward mode the cursor appears to stay at the
// left edge of the replaced slice, so palatalise would examine the text
// before the retained vowel -- confirm intended.
define do_diminutive as (
    [substring] among (

        'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
        'anek' 'onek' 'unek' '{a'}nek'
        'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
        '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
        '{a'}tk' '{a'}nk' 'u{s^}k'
        'k'
            ( delete )

        'e{c^}ek' 'enek' 'ek'
            ( <- 'e'  palatalise )

        '{e'}{c^}ek' '{e'}k'
            ( <- '{e'}'  palatalise )

        'i{c^}ek' 'inek' 'ik'
            ( <- 'i'  palatalise )

        '{i'}{c^}ek' '{i'}k'
            ( <- '{i'}'  palatalise )

        // Vowel + 'k': keep only the vowel.
        '{a'}k'
            ( <- '{a'}' )
        'ak'
            ( <- 'a' )
        'ok'
            ( <- 'o' )
        'uk'
            ( <- 'u' )
    )
)

// Shortens a comparative suffix (e-caron or e, followed by 'j' and s-caron)
// to its leading vowel; palatalise must then succeed for the routine to
// succeed.  No region test is applied.
//
// NOTE(review): after '<-' in backward mode the cursor appears to stay at the
// left edge of the replaced slice, so palatalise would examine the text
// before the retained vowel -- confirm intended.
define do_comparative as (
    [substring] among (

        '{e^}j{s^}'
            ( <- '{e^}'  palatalise )

        'ej{s^}'
            ( <- 'e'  palatalise )
    )
)

// Extra steps of the aggressive stemmer.  The comparative, diminutive and
// augmentative steps are each attempted regardless of the others' outcome.
// Finally do_derivational is tried, falling back to do_deriv_single; the
// routine's signal is that of this last alternative.
define do_aggressive as (
    do do_comparative
    do do_diminutive
    do do_augmentative
    ( do_derivational or do_deriv_single )
)
)

// Entry point: stems the current word in place.
//
// The regions are marked in a forward pass, then the suffix-stripping steps
// run from the end of the word.  Each step is wrapped in 'do' so that a step
// which finds nothing to remove does not abort the steps after it.  Without
// 'do', a word with no case ending would skip the possessive and aggressive
// steps entirely, and a word with a case ending but no possessive suffix
// would never reach do_aggressive, so inflected forms of one word could end
// up with different stems.
define stem as (
    do mark_regions
    backwards (
        do do_case
        do do_possessive
        // light and aggressive are the same to this point
        // comment next line for light stemmer
        do do_aggressive
    )
)

// Ljiljana Dolamic and Jacques Savoy. 2009.
// Indexing and stemming approaches for the Czech language.
// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt