/*
* Affix stripping stemming algorithm for Tamil
* By Damodharan Rajalingam
*/
// Inside string literals, {...} introduces an escape: either a Unicode
// code point such as {U+0B95} or the name of a stringdef such as {ka}.
stringescapes {}

// Aytham
stringdef aytham    '{U+0B83}'

// Uyir - independent vowels
stringdef a         '{U+0B85}'
stringdef aa        '{U+0B86}'
stringdef i         '{U+0B87}'
stringdef ii        '{U+0B88}'
stringdef u         '{U+0B89}'
stringdef uu        '{U+0B8A}'
stringdef e         '{U+0B8E}'
stringdef ee        '{U+0B8F}'
stringdef ai        '{U+0B90}'
stringdef o         '{U+0B92}'
stringdef oo        '{U+0B93}'
stringdef au        '{U+0B94}'

// Consonants
stringdef ka        '{U+0B95}'
stringdef nga       '{U+0B99}'
stringdef ca        '{U+0B9A}'
stringdef ja        '{U+0B9C}'
stringdef nya       '{U+0B9E}'
stringdef tta       '{U+0B9F}'
stringdef nna       '{U+0BA3}'
// `ta` and `tha` are two names for the same code point (U+0BA4).
stringdef ta        '{U+0BA4}'
stringdef tha       '{U+0BA4}'
stringdef na        '{U+0BA8}'
stringdef nnna      '{U+0BA9}'
stringdef pa        '{U+0BAA}'
stringdef ma        '{U+0BAE}'
stringdef ya        '{U+0BAF}'
stringdef ra        '{U+0BB0}'
stringdef rra       '{U+0BB1}'
stringdef la        '{U+0BB2}'
stringdef lla       '{U+0BB3}'
// `llla` and `zha` are two names for the same code point (U+0BB4).
stringdef llla      '{U+0BB4}'
stringdef zha       '{U+0BB4}'
stringdef va        '{U+0BB5}'

// Vatamozi - borrowed
stringdef sha       '{U+0BB6}'
stringdef ssa       '{U+0BB7}'
stringdef sa        '{U+0BB8}'
stringdef ha        '{U+0BB9}'

// Dependent vowel signs (kombu etc.)
stringdef vs_aa     '{U+0BBE}'
stringdef vs_i      '{U+0BBF}'
stringdef vs_ii     '{U+0BC0}'
stringdef vs_u      '{U+0BC1}'
stringdef vs_uu     '{U+0BC2}'
stringdef vs_e      '{U+0BC6}'
stringdef vs_ee     '{U+0BC7}'
stringdef vs_ai     '{U+0BC8}'
stringdef vs_o      '{U+0BCA}'
stringdef vs_oo     '{U+0BCB}'
stringdef vs_au     '{U+0BCC}'

// Pulli
stringdef pulli     '{U+0BCD}'

// AU length mark
stringdef au_lmark  '{U+0BD7}'
// Internal routines, listed by the kind of work they do.
routines (
    // length guard
    has_min_length

    // prefix handling
    remove_question_prefixes
    remove_pronoun_prefixes
    fix_va_start

    // suffix handling
    remove_question_suffixes
    remove_um
    remove_common_word_endings
    remove_vetrumai_urupukal
    remove_plural_suffix
    remove_command_suffixes
    remove_tense_suffix
    remove_tense_suffixes

    // tidying up the end of the word
    fix_ending
    fix_endings
)

// The one routine exported to callers.
externals ( stem )

// Flags shared between routines.
booleans (
    found_vetrumai_urupu    // set in remove_vetrumai_urupukal, consulted in fix_ending
    found_a_match           // set in remove_tense_suffix and used as its final signal
)
// Guard used by most routines: signal t only when the word has at least
// five characters, signal f otherwise.
define has_min_length as (
    $(len >= 5)
)
// If the text at the cursor starts with {va} carrying a u/uu/o/oo vowel
// sign, replace that pair with the corresponding independent vowel.
// Gives signal f when none of the four pairs is present.
define fix_va_start as (
    [substring] among (
        '{va}{vs_u}'    ( <- '{u}' )
        '{va}{vs_uu}'   ( <- '{uu}' )
        '{va}{vs_o}'    ( <- '{o}' )
        '{va}{vs_oo}'   ( <- '{oo}' )
    )
)
// Run fix_ending again and again until it signals f.  Always signals t.
define fix_endings as (
    do ( repeat fix_ending )
)
// Delete a leading {e} + consonant + {pulli} sequence, then let
// fix_va_start repair the new beginning of the word if it applies.
// Gives signal f (and changes nothing) when the prefix is absent.
define remove_question_prefixes as (
    [
        '{e}'
        among ( '{ka}' '{nga}' '{ca}' '{nya}' '{tha}' '{na}' '{pa}' '{ma}' '{ya}' '{va}' )
        '{pulli}'
    ] delete
    do fix_va_start
)
// Gives signal t if an ending was fixed, signal f otherwise.
// Nothing is attempted (signal f) unless the word is longer than three
// characters.  Working backwards from the end of the word, the table of
// literal endings is tried first; when no table entry applies, the generic
// rules for a word ending in {pulli} are tried instead.
define fix_ending as (
$(len > 3)
backwards (
(
// Literal endings.  Consecutive strings share the action that follows them.
[substring] among (
'{na}{pulli}'
'{na}{pulli}{ta}'
'{na}{pulli}{ta}{pulli}'
( delete )
'{ya}{pulli}'
// dropped only when {vs_ai}, {vs_i} or {vs_ii} comes right before it
( test among('{vs_ai}' '{vs_i}' '{vs_ii}') delete )
'{tta}{pulli}{pa}{pulli}'
'{tta}{pulli}{ka}{pulli}'
( <- '{lla}{pulli}' )
'{nnna}{pulli}{rra}{pulli}'
( <- '{la}{pulli}' )
'{rra}{pulli}{ka}{pulli}'
// '{nnna}{pulli}{nnna}{pulli}'
( <- '{la}{pulli}' )
'{tta}{pulli}{tta}{pulli}'
( <- '{tta}{vs_u}' )
'{ta}{pulli}{ta}{pulli}'
// applies only if found_vetrumai_urupu is set and no {vs_ai} precedes the ending
( found_vetrumai_urupu not '{vs_ai}' <- '{ma}{pulli}' )
'{vs_u}{ka}{pulli}'
'{vs_u}{ka}{pulli}{ka}{pulli}'
( <- '{pulli}' )
'{va}'
'{ya}'
'{va}{pulli}'
( delete )
'{nnna}{vs_u}'
(
// dropped unless one of these vowel signs comes right before it
not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
delete
)
'{nga}{pulli}'
(
// Decide by what precedes the ending: after {vs_ai} or {pulli} the ending
// is dropped; in every other case (the empty string always matches) it is
// replaced by {ma}{pulli}.
among (
'{vs_ai}' ( delete )
'{pulli}' ( delete )
'' ( <- '{ma}{pulli}' )
)
)
)
) or
// Generic rules for a word whose last character is {pulli}; the slice to be
// deleted starts at that {pulli}.
( [ '{pulli}'
(
// One of these six consonants before the {pulli}, optionally preceded by a
// second {pulli} + consonant pair from the same set: delete all of it.
( among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')
try ( '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') )
] delete )
or
// One of these consonants before the {pulli}, itself preceded by another
// {pulli}: delete the consonant and the final {pulli}; the earlier {pulli}
// lies outside the slice and stays.
( among(
'{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}'
'{nya}' '{nna}' '{na}' '{ma}' '{nnna}') ] '{pulli}' delete )
or
// A final {pulli} that directly follows a vowel sign or another {pulli}:
// delete just that final {pulli}.
( test among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}' '{pulli}') ] delete )
)
)
)
)
// Delete a leading {a}, {i} or {u} followed by consonant + {pulli}, then
// let fix_va_start repair the new beginning of the word if it applies.
// Gives signal f (and changes nothing) when the prefix is absent.
define remove_pronoun_prefixes as (
    [
        among ( '{a}' '{i}' '{u}' )
        among ( '{ka}' '{nga}' '{ca}' '{nya}' '{tha}' '{na}' '{pa}' '{ma}' '{ya}' '{va}' )
        '{pulli}'
    ] delete
    do fix_va_start
)
// Strip a word-final {ka}{lla}{pulli}, adjusting what is left behind for
// the three longer endings listed.  Signals f if the word does not end in
// {ka}{lla}{pulli}.
define remove_plural_suffix as (
    backwards (
        [substring] among (
            '{ka}{lla}{pulli}'
                ( delete )

            '{tta}{pulli}{ka}{lla}{pulli}'
                ( <- '{lla}{pulli}' )

            '{rra}{pulli}{ka}{lla}{pulli}'
                ( <- '{la}{pulli}' )

            '{vs_u}{nga}{pulli}{ka}{lla}{pulli}'
                (
                    // After one of these six consonants only the
                    // {ka}{lla}{pulli} part goes; otherwise the whole
                    // ending collapses to a bare {pulli}.
                    (
                        among ( '{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}' )
                        <- '{vs_u}{nga}{pulli}'
                    )
                    or
                    ( <- '{pulli}' )
                )
        )
    )
)
// For words passing has_min_length: turn a final {vs_aa}, {vs_ee} or
// {vs_oo} into {pulli} (if present), then tidy the ending.
define remove_question_suffixes as (
    has_min_length
    backwards (
        do (
            [ among ( '{vs_aa}' '{vs_ee}' '{vs_oo}' ) ]
            <- '{pulli}'
        )
    )
    do fix_endings
)
// For words passing has_min_length: delete a final {pa}{vs_i} or
// {va}{vs_i}.  Signals f when neither ending is present.
define remove_command_suffixes as (
    has_min_length
    backwards (
        [substring] among (
            '{pa}{vs_i}'
            '{va}{vs_i}'
                ( delete )
        )
    )
)
// For words passing has_min_length: replace a final {vs_u}{ma}{pulli}
// with {pulli}, then give fix_ending one chance to tidy the result.
// Signals f when the word does not end in {vs_u}{ma}{pulli}.
define remove_um as (
    has_min_length
    backwards (
        [ '{vs_u}{ma}{pulli}' ]
        <- '{pulli}'
    )
    do fix_ending
)
// The endings handled here are not true suffixes: they are separate words
// that get attached to other words and can be dropped for stemming.
// Applies only to words passing has_min_length; signals f when none of the
// endings is present.
define remove_common_word_endings as (
    has_min_length
    backwards (
        [substring] among (
            // Endings that begin with a vowel sign: replace by {pulli}.
            '{vs_aa}{ka}{vs_i}'
            '{vs_aa}{ka}{vs_i}{ya}'
            '{vs_i}{tta}{ma}{pulli}'
            '{vs_i}{la}{pulli}{la}{vs_ai}'
            '{vs_i}{nnna}{pulli}{rra}{vs_i}'
            '{vs_u}{tta}{nnna}{pulli}'
            '{vs_u}{tta}{vs_ai}'
            '{vs_u}{tta}{vs_ai}{ya}'
            '{vs_u}{lla}{pulli}{lla}'
            '{vs_e}{nnna}'
            '{vs_e}{nnna}{vs_u}{ma}{pulli}'
            '{vs_e}{nnna}{pulli}{rra}{vs_u}'
                ( <- '{pulli}' )

            // Replaced by {pulli} unless a listed vowel sign precedes it.
            '{la}{pulli}{la}'
                (
                    not among ( '{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}' )
                    <- '{pulli}'
                )

            // Endings that are removed outright.
            '{pa}{tta}{vs_u}'
            '{pa}{tta}{vs_i}'
            '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
            '{pa}{tta}{pulli}{tta}'
            '{pa}{tta}{pulli}{tta}{vs_u}'
            '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
            '{pa}{tta}{pulli}{tta}{nna}'
            '{pa}{rra}{pulli}{rra}{vs_i}'
            '{va}{vs_i}{tta}{vs_u}'
            '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
            '{ka}{vs_u}{ra}{vs_i}{ya}'
            '{ta}{vs_aa}{nnna}'
            '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}'
                ( delete )
        )
    )
    do fix_endings
)
// Strip the endings this file groups under "vetrumai urupukal"
// (presumably Tamil case markers, going by the routine name - confirm).
// found_vetrumai_urupu is cleared on entry and set again only after one of
// the endings below has been handled.  Signals f for words that fail
// has_min_length or that carry none of these endings.
define remove_vetrumai_urupukal as (
unset found_vetrumai_urupu
has_min_length
backwards (
(
// Table of literal endings; consecutive strings share the action that
// follows them.  `test` puts the cursor back at the end of the word so the
// follow-up step after the flag is set starts from there.
test (
[substring] among (
'{nnna}{vs_ai}'
( delete )
'{vs_o}{tta}{vs_u}'
'{vs_oo}{tta}{vs_u}'
'{vs_i}{la}{pulli}'
'{vs_i}{rra}{pulli}'
'{vs_i}{nnna}{pulli}{rra}{vs_u}'
'{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}'
'{va}{vs_i}{tta}'
'{vs_aa}{la}{pulli}'
'{vs_u}{tta}{vs_ai}'
'{vs_aa}{ma}{la}{pulli}'
'{vs_u}{lla}{pulli}'
( <- '{pulli}' )
'{vs_i}{nnna}{pulli}'
// not applied when {ma} comes right before the ending
( not '{ma}' <- '{pulli}' )
'{vs_i}{tta}{ma}{pulli}'
// applied only to words of at least seven characters
( $(len >= 7) <- '{pulli}' )
'{la}{pulli}'
(
// not applied when one of these vowel signs comes right before the ending
not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
<- '{pulli}'
)
'{ka}{nna}{pulli}'
'{ma}{vs_u}{nnna}{pulli}'
'{ma}{vs_ee}{la}{pulli}'
'{ma}{vs_ee}{rra}{pulli}'
'{ka}{vs_ii}{llla}{pulli}'
(delete)
'{ta}{vs_u}'
(
// not applied when one of these vowel signs comes right before the ending
not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
delete
)
'{vs_ii}'
( <- '{vs_i}' )
)
)
or
// Fallback when no table entry applied: a final {vs_ai} becomes {pulli},
// provided it is either not preceded by one of the six listed consonants,
// or is preceded by such a consonant which in turn follows a {pulli}.
test (
[ '{vs_ai}'
(
(not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))
or
(test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))
)
] <- '{pulli}'
)
)
// Reached only if one of the two alternatives above succeeded.
(set found_vetrumai_urupu)
// If the word now ends in {vs_i}{nnna}{pulli}, reduce that to {pulli} too.
do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
)
do fix_endings
)
// Strip tense suffixes one after another until remove_tense_suffix
// signals f.  Always signals t.
define remove_tense_suffixes as (
    repeat ( remove_tense_suffix )
)
// Gives signal t if a tense suffix was removed, signal f otherwise.
// found_a_match is cleared on entry, set by either of the two removal steps
// below, and tested as the routine's last command to produce that signal.
// Words failing has_min_length are left alone (signal f).
define remove_tense_suffix as (
unset found_a_match
has_min_length
backwards (
// Step 1: table of literal endings; consecutive strings share the action
// that follows them.  Wrapped in `do` so a miss does not abort the routine,
// and in `test` so the cursor is back at the end of the word for step 2.
do (
test (
[substring] among (
'{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
'{pa}{tta}{vs_u}'
'{ma}{vs_aa}{ra}{pulli}'
'{ma}{vs_i}{nnna}{pulli}'
'{nnna}{nnna}{pulli}'
'{nnna}{vs_aa}{nnna}{pulli}'
'{nnna}{vs_aa}{lla}{pulli}'
'{nnna}{vs_aa}{ra}{pulli}'
'{nnna}{lla}{pulli}'
'{va}{lla}{pulli}'
'{nnna}{ra}{pulli}'
'{va}{ra}{pulli}'
'{nnna}'
'{pa}'
'{ka}'
'{ta}'
'{ya}'
'{pa}{nnna}{pulli}'
'{pa}{lla}{pulli}'
'{pa}{ra}{pulli}'
'{vs_i}{rra}{pulli}{rra}{vs_u}'
'{pa}{ma}{pulli}'
'{nnna}{ma}{pulli}'
'{ta}{vs_u}{ma}{pulli}'
'{rra}{vs_u}{ma}{pulli}'
'{ka}{vs_u}{ma}{pulli}'
'{nnna}{vs_e}{nnna}{pulli}'
'{nnna}{vs_ai}'
'{va}{vs_ai}'
( delete )
'{va}{nnna}{pulli}'
(
// not removed when an independent vowel comes right before the ending
not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')
delete
)
'{ta}{vs_u}'
(
// not removed when one of these vowel signs comes right before the ending
not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
delete
)
'{vs_aa}{nnna}{pulli}'
// not applied when {ca} comes right before the ending
( not '{ca}' <- '{pulli}' )
'{vs_aa}{lla}{pulli}'
'{vs_aa}{ra}{pulli}'
'{vs_ee}{nnna}{pulli}'
'{vs_aa}'
'{vs_aa}{ma}{pulli}'
'{vs_e}{ma}{pulli}'
'{vs_ee}{ma}{pulli}'
'{vs_oo}{ma}{pulli}'
'{tta}{vs_u}{ma}{pulli}'
'{vs_aa}{ya}{pulli}'
'{nnna}{vs_i}{ra}{pulli}'
'{vs_ii}{ra}{pulli}'
'{vs_ii}{ya}{ra}{pulli}'
( <- '{pulli}' )
'{ka}{vs_u}'
// removed only when a {pulli} comes right before the ending
( test '{pulli}' delete )
)
// Reached only if a table entry matched and its action succeeded.
(set found_a_match)
)
)
// Step 2: independently of step 1, delete any of these endings if the word
// (as it stands now) finishes with one of them.
do ([among(
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
'{ka}{vs_i}{nnna}{pulli}{rra}'
'{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
'{ka}{vs_i}{rra}'
'{ka}{vs_i}{rra}{pulli}'
)] delete
(set found_a_match)
)
)
do fix_endings
// The routine's signal: t exactly when one of the steps removed something.
found_a_match
)
// Entry point.  Tidies the ending once, then - only for words that pass
// has_min_length - runs every affix-removal step in a fixed order.  Each
// step is wrapped in `do`, so a step that does not apply is simply
// skipped.  If has_min_length fails, the remaining steps are not run and
// stem signals f; the result of the initial fix_ending is kept.
define stem as (
    unset found_vetrumai_urupu

    // preliminary clean-up, applied to words of any length
    do fix_ending

    // everything below is reserved for longer words
    has_min_length

    // prefixes
    do remove_question_prefixes
    do remove_pronoun_prefixes

    // suffixes
    do remove_question_suffixes
    do remove_um
    do remove_common_word_endings
    do remove_vetrumai_urupukal
    do remove_plural_suffix
    do remove_command_suffixes
    do remove_tense_suffixes
)