Tamil stemming algorithm

Links to resources

History of functional changes to the algorithm

  • October 2015: Contributed by Damodharan Rajalingam

The stemming algorithm

/*
* Affix stripping stemming algorithm for Tamil
* By Damodharan Rajalingam
*/

// Use { and } as the escape delimiters inside string literals: '{name}'
// expands to the string bound to name by a stringdef below, and
// '{U+XXXX}' denotes the Unicode code point with that hexadecimal value.
stringescapes {}

/* Aytham */
stringdef aytham   '{U+0B83}'

/* Uyir - independent vowels */
stringdef a        '{U+0B85}'
stringdef aa       '{U+0B86}'
stringdef i        '{U+0B87}'
stringdef ii       '{U+0B88}'
stringdef u        '{U+0B89}'
stringdef uu       '{U+0B8A}'
stringdef e        '{U+0B8E}'
stringdef ee       '{U+0B8F}'
stringdef ai       '{U+0B90}'
stringdef o        '{U+0B92}'
stringdef oo       '{U+0B93}'
stringdef au       '{U+0B94}'

/* Consonants */
// NOTE: two pairs of names below are aliases for one code point each:
//   ta   and tha both expand to U+0BA4
//   llla and zha both expand to U+0BB4
// The routines in this file use both spellings interchangeably.
stringdef ka       '{U+0B95}'
stringdef nga      '{U+0B99}'
stringdef ca       '{U+0B9A}'
stringdef ja       '{U+0B9C}'
stringdef nya      '{U+0B9E}'
stringdef tta      '{U+0B9F}'
stringdef nna      '{U+0BA3}'
stringdef ta       '{U+0BA4}'
stringdef tha      '{U+0BA4}'
stringdef na       '{U+0BA8}'
stringdef nnna     '{U+0BA9}'
stringdef pa       '{U+0BAA}'
stringdef ma       '{U+0BAE}'
stringdef ya       '{U+0BAF}'
stringdef ra       '{U+0BB0}'
stringdef rra      '{U+0BB1}'
stringdef la       '{U+0BB2}'
stringdef lla      '{U+0BB3}'
stringdef llla     '{U+0BB4}'
stringdef zha      '{U+0BB4}'
stringdef va       '{U+0BB5}'

/* Vatamozi - borrowed */
stringdef sha      '{U+0BB6}'
stringdef ssa      '{U+0BB7}'
stringdef sa       '{U+0BB8}'
stringdef ha       '{U+0BB9}'


/* Dependent vowel signs (kombu etc.) */
// Each vs_X is named after the independent vowel X defined above.
stringdef vs_aa    '{U+0BBE}'
stringdef vs_i     '{U+0BBF}'
stringdef vs_ii    '{U+0BC0}'
stringdef vs_u     '{U+0BC1}'
stringdef vs_uu    '{U+0BC2}'
stringdef vs_e     '{U+0BC6}'
stringdef vs_ee    '{U+0BC7}'
stringdef vs_ai    '{U+0BC8}'
stringdef vs_o     '{U+0BCA}'
stringdef vs_oo    '{U+0BCB}'
stringdef vs_au    '{U+0BCC}'

/* Pulli */
stringdef pulli    '{U+0BCD}'

/* AU length mark */
stringdef au_lmark '{U+0BD7}'


// Internal routines. They are grouped by role; the order in which names
// are declared here has no effect on how the stemmer behaves.
routines (
 // guard
 has_min_length

 // repairs applied to the word after something has been stripped
 fix_va_start
 fix_ending
 fix_endings

 // prefix strippers
 remove_question_prefixes
 remove_pronoun_prefixes

 // suffix strippers
 remove_question_suffixes
 remove_um
 remove_common_word_endings
 remove_vetrumai_urupukal
 remove_plural_suffix
 remove_command_suffixes
 remove_tense_suffix
 remove_tense_suffixes
)

// The single routine exported to callers of the generated stemmer.
externals ( stem )

// Flags shared between routines:
//   found_vetrumai_urupu - set in remove_vetrumai_urupukal, tested in fix_ending
//   found_a_match        - set in remove_tense_suffix and used as its signal
booleans (
 found_vetrumai_urupu
 found_a_match
)

// Guard used by the stripping routines: signals t only when the current
// word is at least five characters long, f otherwise.
define has_min_length as (
 $(len >= 5)
)

// If the text at the cursor is va followed by one of the vowel signs
// u, uu, o or oo, replace that pair with the matching independent vowel.
// Signals f when none of the four pairs is present.
define fix_va_start as (
 [substring] among (
  '{va}{vs_u}'
    ( <- '{u}' )
  '{va}{vs_uu}'
    ( <- '{uu}' )
  '{va}{vs_o}'
    ( <- '{o}' )
  '{va}{vs_oo}'
    ( <- '{oo}' )
 )
)

// Keep applying fix_ending until it signals f. Because the loop is
// wrapped in `do`, this routine itself always signals t.
define fix_endings as (
 do (
  repeat fix_ending
 )
)

// Deletes a leading  e + consonant + pulli  sequence, where the consonant
// is one of the ten listed below, and then lets fix_va_start repair a
// va + vowel-sign pair at the cursor. Signals f (leaving the word alone)
// when the prefix is absent.
define remove_question_prefixes as (
 [
  '{e}'
  among (
   '{ka}' '{nga}' '{ca}' '{nya}' '{tha}'
   '{na}' '{pa}'  '{ma}' '{ya}'  '{va}'
  )
  '{pulli}'
 ] delete

 do fix_va_start
)

// Gives signal t if an ending was fixed, signal f otherwise.
//
// Only words longer than three characters are considered. Working
// backwards from the end of the word, the routine first tries a table of
// known endings; if that fails it falls back to a set of rules for a word
// that ends in pulli. fix_endings calls this routine in a loop; stem and
// remove_um each call it once.
define fix_ending as (
 $(len > 3)
 backwards (
  (
  // Alternative 1: table lookup. [substring] brackets the longest listed
  // ending that matches, so the delete / <- in each action below operates
  // on exactly that ending. Extra conditions inside an action look at the
  // text that precedes the ending.
  [substring] among (
    // These three endings are removed outright.
    '{na}{pulli}'
    '{na}{pulli}{ta}'
    '{na}{pulli}{ta}{pulli}'
      ( delete )
    // Removed only when preceded by the vowel sign ai, i or ii.
    '{ya}{pulli}'
      ( test among('{vs_ai}' '{vs_i}' '{vs_ii}') delete )
    // Replaced by lla + pulli.
    '{tta}{pulli}{pa}{pulli}'
    '{tta}{pulli}{ka}{pulli}'
      ( <- '{lla}{pulli}' )
    // Replaced by la + pulli.
    '{nnna}{pulli}{rra}{pulli}'
      ( <- '{la}{pulli}' )
    // Also replaced by la + pulli (the second key is disabled).
    '{rra}{pulli}{ka}{pulli}'
    // '{nnna}{pulli}{nnna}{pulli}'
      ( <- '{la}{pulli}' )
    // Replaced by tta + vowel sign u.
    '{tta}{pulli}{tta}{pulli}'
      ( <- '{tta}{vs_u}' )
    // Replaced by ma + pulli, but only if found_vetrumai_urupu is set and
    // the ending is not preceded by the vowel sign ai.
    '{ta}{pulli}{ta}{pulli}'
      ( found_vetrumai_urupu not '{vs_ai}' <- '{ma}{pulli}' )
    // Replaced by a bare pulli.
    '{vs_u}{ka}{pulli}'
    '{vs_u}{ka}{pulli}{ka}{pulli}'
      ( <- '{pulli}' )
    // Removed outright.
    '{va}'
    '{ya}'
    '{va}{pulli}'
      ( delete )
    // Removed unless preceded by one of the listed vowel signs.
    '{nnna}{vs_u}'
      (
        not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
        delete
      )
    // What happens to this ending depends on what precedes it. The inner
    // among only inspects the preceding text; delete and <- still act on
    // the bracketed nga + pulli ending.
    '{nga}{pulli}'
      (
        among (
          // preceded by vowel sign ai: drop the ending
          '{vs_ai}' ( delete )
          // preceded by pulli: drop the ending
          '{pulli}' ( delete )
          // anything else (the empty string always matches):
          // replace the ending with ma + pulli
          '' ( <- '{ma}{pulli}' )
        )
      )
  )
  ) or
  // Alternative 2: the word ends in pulli. `[` marks the end of the word;
  // each branch below decides where `]` goes and therefore how much text
  // is deleted.
  ( [ '{pulli}'
    (
      // Branch a: pulli is preceded by one of the six listed consonants,
      // optionally preceded in turn by pulli + another of those
      // consonants. Everything matched is deleted.
      ( among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')
        try ( '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') )
        ] delete )
      or
      // Branch b: pulli is preceded by one of the eleven listed consonants
      // which is itself preceded by another pulli. Only the final
      // consonant + pulli is deleted; the earlier pulli is kept.
      ( among(
            '{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}'
            '{nya}' '{nna}' '{na}' '{ma}' '{nnna}') ] '{pulli}' delete )
      or
      // Branch c: pulli is preceded by one of the listed vowel signs or by
      // another pulli. Only the final pulli is deleted.
      ( test among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}' '{pulli}') ] delete )
    )
  )
 )
)

// Deletes a leading  (a | i | u) + consonant + pulli  sequence, where the
// consonant is one of the ten listed below, and then lets fix_va_start
// repair a va + vowel-sign pair at the cursor. Signals f (leaving the
// word alone) when the prefix is absent.
define remove_pronoun_prefixes as (
 [
  among ( '{a}' '{i}' '{u}' )
  among (
   '{ka}' '{nga}' '{ca}' '{nya}' '{tha}'
   '{na}' '{pa}'  '{ma}' '{ya}'  '{va}'
  )
  '{pulli}'
 ] delete

 do fix_va_start
)

// Strips or rewrites an ending built on  ka + lla + pulli.
// among always selects the longest listed ending that matches, so the
// entries are simply listed from shortest to longest.
define remove_plural_suffix as (
 backwards (
  [substring] among (

    // bare ending: remove it
    '{ka}{lla}{pulli}'
      ( delete )

    // tta + pulli in front of it: becomes lla + pulli
    '{tta}{pulli}{ka}{lla}{pulli}'
      ( <- '{lla}{pulli}' )

    // rra + pulli in front of it: becomes la + pulli
    '{rra}{pulli}{ka}{lla}{pulli}'
      ( <- '{la}{pulli}' )

    // vowel sign u + nga + pulli in front of it: if one of the six listed
    // consonants precedes, keep  u-sign + nga + pulli ; otherwise reduce
    // the whole ending to a bare pulli
    '{vs_u}{nga}{pulli}{ka}{lla}{pulli}'
      (
        (
          among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')
          <- '{vs_u}{nga}{pulli}'
        )
        or
        ( <- '{pulli}' )
      )
  )
 )
)

// For words that pass has_min_length: if the word ends in the vowel sign
// aa, ee or oo, that sign is replaced by pulli. fix_endings then runs
// whether or not a replacement took place.
define remove_question_suffixes as (
 has_min_length

 backwards (
  do ( [ among ( '{vs_aa}' '{vs_ee}' '{vs_oo}' ) ] <- '{pulli}' )
 )

 do fix_endings
)

// For words that pass has_min_length: deletes a final  va + vowel sign i
// or  pa + vowel sign i.  Signals f when neither ending is present.
define remove_command_suffixes as (
 has_min_length

 backwards (
  [
   among (
    '{va}{vs_i}'
    '{pa}{vs_i}'
   )
  ] delete
 )
)

// For words that pass has_min_length: replaces a final
// vowel sign u + ma + pulli  with a bare pulli, then gives fix_ending a
// single chance to tidy the result. Signals f, without calling
// fix_ending, when the ending is absent.
define remove_um as (
 has_min_length

 backwards (
  [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
 )

 do fix_ending
)

// Strips a number of frequently attached word endings, then runs
// fix_endings. Signals f (and skips fix_endings) when the word fails
// has_min_length or none of the listed endings matches.
define remove_common_word_endings as (
 // The endings handled here are not true suffixes: they are separate
 // words that get attached to the preceding word, and they can be
 // removed for the purposes of stemming.
 has_min_length
 backwards (
   // [substring] brackets the longest listed ending that matches; the
   // action attached to its group then rewrites or deletes it.
   [substring] among (
     // Group 1: the ending is replaced by a bare pulli.
     '{vs_u}{tta}{nnna}{pulli}'
     '{vs_i}{la}{pulli}{la}{vs_ai}'
     '{vs_i}{tta}{ma}{pulli}'
     '{vs_i}{nnna}{pulli}{rra}{vs_i}'
     '{vs_aa}{ka}{vs_i}'
     '{vs_aa}{ka}{vs_i}{ya}'
     '{vs_e}{nnna}{pulli}{rra}{vs_u}'
     '{vs_u}{lla}{pulli}{lla}'
     '{vs_u}{tta}{vs_ai}{ya}'
     '{vs_u}{tta}{vs_ai}'
     '{vs_e}{nnna}{vs_u}{ma}{pulli}'
     '{vs_e}{nnna}'
       ( <- '{pulli}' )
     // Group 2: replaced by a bare pulli unless the ending is preceded by
     // one of the listed vowel signs.
     '{la}{pulli}{la}'
       (
         not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
         <- '{pulli}'
       )
     // Group 3: the ending is removed outright.
     '{pa}{tta}{vs_u}'
     '{pa}{tta}{pulli}{tta}'
     '{pa}{tta}{pulli}{tta}{vs_u}'
     '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
     '{pa}{tta}{pulli}{tta}{nna}'
     '{ka}{vs_u}{ra}{vs_i}{ya}'
     '{pa}{rra}{pulli}{rra}{vs_i}'
     '{va}{vs_i}{tta}{vs_u}'
     '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
     '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
     '{pa}{tta}{vs_i}'
     '{ta}{vs_aa}{nnna}'
     '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}'
       ( delete )
  )
 )
 // Tidy whatever ending the removal left behind.
 do fix_endings
)

// Strips one ending from the tables below (presumably the Tamil case
// markers the routine is named after - TODO confirm) and records success
// in found_vetrumai_urupu, which fix_ending consults.
//
// The flag is cleared first, so it stays unset whenever the routine
// gives up early: either the word fails has_min_length or neither
// alternative inside the backwards block matches. In both cases the
// routine signals f and the final fix_endings is not reached.
define remove_vetrumai_urupukal as (
 unset found_vetrumai_urupu
 has_min_length
 backwards (
  (
   // Alternative 1: table lookup, wrapped in test so the cursor is put
   // back afterwards. [substring] brackets the longest listed ending that
   // matches; conditions inside an action look at the text before it.
   test (
     [substring] among (
       // Removed outright.
       '{nnna}{vs_ai}'
         ( delete )
       // Replaced by a bare pulli.
       '{vs_o}{tta}{vs_u}'
       '{vs_oo}{tta}{vs_u}'
       '{vs_i}{la}{pulli}'
       '{vs_i}{rra}{pulli}'
       '{vs_i}{nnna}{pulli}{rra}{vs_u}'
       '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}'
       '{va}{vs_i}{tta}'
       '{vs_aa}{la}{pulli}'
       '{vs_u}{tta}{vs_ai}'
       '{vs_aa}{ma}{la}{pulli}'
       '{vs_u}{lla}{pulli}'
         ( <- '{pulli}' )
       // Replaced by a bare pulli unless preceded by ma.
       '{vs_i}{nnna}{pulli}'
         ( not '{ma}' <- '{pulli}' )
       // Replaced by a bare pulli only when the whole word has at least
       // seven characters.
       '{vs_i}{tta}{ma}{pulli}'
         ( $(len >= 7) <- '{pulli}' )
       // Replaced by a bare pulli unless preceded by one of the listed
       // vowel signs.
       '{la}{pulli}'
         (
           not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
           <- '{pulli}'
         )
       // Removed outright.
       '{ka}{nna}{pulli}'
       '{ma}{vs_u}{nnna}{pulli}'
       '{ma}{vs_ee}{la}{pulli}'
       '{ma}{vs_ee}{rra}{pulli}'
       '{ka}{vs_ii}{llla}{pulli}'
         (delete)
       // Removed unless preceded by one of the listed vowel signs.
       '{ta}{vs_u}'
         (
           not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
           delete
         )
       // A final long vowel sign ii is shortened to vowel sign i.
       '{vs_ii}'
         ( <- '{vs_i}' )
      )
   )
   or
   // Alternative 2: a final vowel sign ai is replaced by pulli, provided
   // that what precedes it is either not one of the six listed consonants
   // or is such a consonant followed by pulli. The inner test keeps the
   // consonant + pulli outside the bracketed slice, so only the vowel
   // sign itself is replaced.
   test (
     [ '{vs_ai}'
     (
       (not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))
     or
       (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))
     )
     ] <- '{pulli}'
   )
  )
  // Reached only when one of the alternatives above succeeded.
  (set found_vetrumai_urupu)
  // Optional follow-up: a remaining final  vowel sign i + nnna + pulli
  // is also reduced to a bare pulli.
  do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
 )
 // Tidy whatever ending the removal left behind.
 do fix_endings
)

// Strip tense suffixes one after another: remove_tense_suffix is applied
// repeatedly until it signals f.
define remove_tense_suffixes as (
 repeat (
  remove_tense_suffix
 )
)

// Gives signal t if a tense suffix was removed, signal f otherwise.
//
// found_a_match is cleared on entry, set by either of the two stripping
// steps below when it succeeds, and tested as the last command so that
// it becomes the routine's signal. remove_tense_suffixes relies on that
// signal to decide when to stop looping. A word that fails
// has_min_length makes the routine signal f immediately.
define remove_tense_suffix as (
 unset found_a_match
 has_min_length
 backwards (
  // Step 1: table lookup. It is wrapped in do, so a miss does not abort
  // the routine, and in test, so the cursor is put back afterwards.
  // [substring] brackets the longest listed ending that matches;
  // conditions inside an action look at the text that precedes it.
  do (
   test (
      [substring] among (
        // Removed outright.
        '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
        '{pa}{tta}{vs_u}'
        '{ma}{vs_aa}{ra}{pulli}'
        '{ma}{vs_i}{nnna}{pulli}'
        '{nnna}{nnna}{pulli}'
        '{nnna}{vs_aa}{nnna}{pulli}'
        '{nnna}{vs_aa}{lla}{pulli}'
        '{nnna}{vs_aa}{ra}{pulli}'
        '{nnna}{lla}{pulli}'
        '{va}{lla}{pulli}'
        '{nnna}{ra}{pulli}'
        '{va}{ra}{pulli}'
        '{nnna}'
        '{pa}'
        '{ka}'
        '{ta}'
        '{ya}'
        '{pa}{nnna}{pulli}'
        '{pa}{lla}{pulli}'
        '{pa}{ra}{pulli}'
        '{vs_i}{rra}{pulli}{rra}{vs_u}'
        '{pa}{ma}{pulli}'
        '{nnna}{ma}{pulli}'
        '{ta}{vs_u}{ma}{pulli}'
        '{rra}{vs_u}{ma}{pulli}'
        '{ka}{vs_u}{ma}{pulli}'
        '{nnna}{vs_e}{nnna}{pulli}'
        '{nnna}{vs_ai}'
        '{va}{vs_ai}'
            ( delete )
        // Removed unless preceded by one of the independent vowels.
        '{va}{nnna}{pulli}'
            (
              not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')
              delete
            )
        // Removed unless preceded by one of the listed vowel signs.
        '{ta}{vs_u}'
            (
              not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')
              delete
            )
        // Replaced by a bare pulli unless preceded by ca.
        '{vs_aa}{nnna}{pulli}'
            ( not '{ca}' <- '{pulli}' )
        // Replaced by a bare pulli.
        '{vs_aa}{lla}{pulli}'
        '{vs_aa}{ra}{pulli}'
        '{vs_ee}{nnna}{pulli}'
        '{vs_aa}'
        '{vs_aa}{ma}{pulli}'
        '{vs_e}{ma}{pulli}'
        '{vs_ee}{ma}{pulli}'
        '{vs_oo}{ma}{pulli}'
        '{tta}{vs_u}{ma}{pulli}'
        '{vs_aa}{ya}{pulli}'
        '{nnna}{vs_i}{ra}{pulli}'
        '{vs_ii}{ra}{pulli}'
        '{vs_ii}{ya}{ra}{pulli}'
            ( <- '{pulli}' )
        // Removed only when preceded by pulli.
        '{ka}{vs_u}'
            ( test '{pulli}' delete )
      )
      // Reached only when an entry matched and its action succeeded.
      (set found_a_match)
      )
  )
  // Step 2: independently of step 1, delete one of these six endings if
  // the word now ends in it.
  do ([among(
              '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
              '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
              '{ka}{vs_i}{nnna}{pulli}{rra}'
              '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
              '{ka}{vs_i}{rra}'
              '{ka}{vs_i}{rra}{pulli}'
            )] delete
    (set found_a_match)
    )
 )
 // Tidy the new ending, then report whether anything was stripped.
 do fix_endings
 found_a_match
)

// Entry point of the stemmer (the only routine listed in externals).
//
// fix_ending is tried once on every word; it carries its own length
// check. The rest of the pipeline runs only for words that pass
// has_min_length. Each stripping step is wrapped in do, so a step that
// finds nothing to strip does not prevent the later steps from running.
// The order of the steps is significant: later routines see the word as
// the earlier ones left it.
define stem as (
 // Start with the flag that fix_ending tests in a known state.
 unset found_vetrumai_urupu
 do fix_ending
 // Words that fail this check skip all of the steps below.
 has_min_length
 // Prefixes first ...
 do remove_question_prefixes
 do remove_pronoun_prefixes
 // ... then suffixes.
 do remove_question_suffixes
 do remove_um
 do remove_common_word_endings
 do remove_vetrumai_urupukal
 do remove_plural_suffix
 do remove_command_suffixes
 do remove_tense_suffixes
)