Nepali stemming algorithm

Links to resources

History of functional changes to the algorithm

  • April 2018: Contributed by Ingroj Shrestha, Oleg Bartunov and Shreeya Singh Dhakal

The stemming algorithm

/*
 * Authors:
 * - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group
 * - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd.
 * - Shreeya Singh Dhakal, Nepali NLP Group
 */

// Internal routines of the stemmer. All three are defined in the
// backwardmode section of this file and are called from stem.
routines (
  remove_category_1
  remove_category_2
  remove_category_3
)

// Use { and } as the escape delimiters inside string literals, so that
// sequences such as {U+0901} and {dsc} are expanded rather than taken
// literally.
stringescapes {}

// Named Devanagari code points used by the suffix tables.
// Naming scheme (see the Unicode name in each trailing comment):
//   ds*  = DEVANAGARI SIGN ...
//   dl*  = DEVANAGARI LETTER ...
//   dvs* = DEVANAGARI VOWEL SIGN ...
// Names that are easy to misread: dlc is LETTER CHA (U+091B), dlb is
// LETTER BHA (U+092D), and dlg is LETTER GA (U+0917).
stringdef dsc     '{U+0901}'  // DEVANAGARI_SIGN_CANDRABINDU
stringdef dsa     '{U+0902}'  // DEVANAGARI_SIGN_ANUSVARA
stringdef dli     '{U+0907}'  // DEVANAGARI_LETTER_I
stringdef dlii    '{U+0908}'  // DEVANAGARI_LETTER_II
stringdef dle     '{U+090F}'  // DEVANAGARI_LETTER_E
stringdef dlka    '{U+0915}'  // DEVANAGARI_LETTER_KA
stringdef dlkha   '{U+0916}'  // DEVANAGARI_LETTER_KHA
stringdef dlg     '{U+0917}'  // DEVANAGARI_LETTER_GA
stringdef dlc     '{U+091B}'  // DEVANAGARI_LETTER_CHA
stringdef dlta    '{U+0924}'  // DEVANAGARI_LETTER_TA
stringdef dltha   '{U+0925}'  // DEVANAGARI_LETTER_THA
stringdef dld     '{U+0926}'  // DEVANAGARI_LETTER_DA
stringdef dln     '{U+0928}'  // DEVANAGARI_LETTER_NA
stringdef dlpa    '{U+092A}'  // DEVANAGARI_LETTER_PA
stringdef dlpha   '{U+092B}'  // DEVANAGARI_LETTER_PHA
stringdef dlb     '{U+092D}'  // DEVANAGARI_LETTER_BHA
stringdef dlm     '{U+092E}'  // DEVANAGARI_LETTER_MA
stringdef dly     '{U+092F}'  // DEVANAGARI_LETTER_YA
stringdef dlr     '{U+0930}'  // DEVANAGARI_LETTER_RA
stringdef dll     '{U+0932}'  // DEVANAGARI_LETTER_LA
stringdef dlv     '{U+0935}'  // DEVANAGARI_LETTER_VA
stringdef dls     '{U+0938}'  // DEVANAGARI_LETTER_SA
stringdef dlh     '{U+0939}'  // DEVANAGARI_LETTER_HA
stringdef dvsaa   '{U+093E}'  // DEVANAGARI_VOWEL_SIGN_AA
stringdef dvsi    '{U+093F}'  // DEVANAGARI_VOWEL_SIGN_I
stringdef dvsii   '{U+0940}'  // DEVANAGARI_VOWEL_SIGN_II
stringdef dvsu    '{U+0941}'  // DEVANAGARI_VOWEL_SIGN_U
stringdef dvsuu   '{U+0942}'  // DEVANAGARI_VOWEL_SIGN_UU
stringdef dvse    '{U+0947}'  // DEVANAGARI_VOWEL_SIGN_E
stringdef dvsai   '{U+0948}'  // DEVANAGARI_VOWEL_SIGN_AI
stringdef dvso    '{U+094B}'  // DEVANAGARI_VOWEL_SIGN_O
stringdef dvsau   '{U+094C}'  // DEVANAGARI_VOWEL_SIGN_AU
stringdef dsv     '{U+094D}'  // DEVANAGARI_SIGN_VIRAMA

// stem is the only routine exported from this file.
externals ( stem )

backwardmode (
  // remove_category_1: handle one suffix from the first suffix table.
  //
  // `[substring] among (...)` looks for the longest listed string that ends
  // the text before the cursor and marks it as the slice; the action attached
  // to the group containing that string is then run:
  //   - first group (everything up to '{dlm}{dvsaa}'): the suffix is deleted
  //     unconditionally;
  //   - second group (LETTER KA followed by VOWEL SIGN O, AA, I, II or AI):
  //     the suffix is left in place when it is immediately preceded by
  //     LETTER E or VOWEL SIGN E, and deleted otherwise.
  // The routine signals failure when none of the listed suffixes matches.
  define remove_category_1 as(
    [substring] among (
      // Group 1: always deleted.
      '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}'
      '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}'
      '{dls}{dsc}{dlg}{dvsai}'
      '{dls}{dsa}{dlg}'
      '{dls}{dsc}{dlg}'
      '{dll}{dvsaa}{dli}'
      '{dll}{dvsaa}{dlii}'
      '{dlpa}{dlc}{dvsi}'
      '{dll}{dvse}'
      '{dlr}{dlta}'
      '{dlm}{dvsai}'
      '{dlm}{dvsaa}'
        (delete)
      // Group 2: KA + vowel sign; kept after LETTER E / VOWEL SIGN E.
      '{dlka}{dvso}'
      '{dlka}{dvsaa}'
      '{dlka}{dvsi}'
      '{dlka}{dvsii}'
      '{dlka}{dvsai}'
        // The first two alternatives succeed without changing the word;
        // delete is reached only when neither of them matches.
        ('{dle}' or '{dvse}' or delete)
    )
  )

  // remove_category_2: drop a trailing sign, but only in a specific context.
  //
  //   - A final CANDRABINDU or ANUSVARA is deleted when the two characters
  //     immediately before it are YA + VOWEL SIGN AU, CHA + VOWEL SIGN AU,
  //     NA + VOWEL SIGN AU, or THA + VOWEL SIGN E.
  //   - A final VOWEL SIGN AI is deleted when it is immediately preceded by
  //     TA + VIRAMA + RA.
  // In every other case the routine signals failure and the word is left
  // unchanged.
  define remove_category_2 as (
    [substring] among (
      '{dsa}'
      '{dsc}'
        (
          // Context test first; delete runs only if one alternative matched.
          (
            '{dly}{dvsau}' or
            '{dlc}{dvsau}' or
            '{dln}{dvsau}' or
            '{dltha}{dvse}'
          )
          delete
        )

      '{dvsai}'
        (
          '{dlta}{dsv}{dlr}'
          delete
        )
    )
  )

  // remove_category_3: delete one suffix from the third suffix table.
  //
  // `[substring] among (...)` selects the longest listed string that ends the
  // text before the cursor and marks it as the slice. All entries share the
  // single action at the bottom of the list, so the matched suffix is deleted
  // unconditionally. The routine signals failure when no entry matches.
  //
  // The longest match wins regardless of listing order, so an entry such as
  // the bare '{dlc}' at the end is only used when no longer entry applies.
  define remove_category_3 as(
    [substring] among(
      '{dltha}{dvsi}{dli}{dls}{dsv}'
      '{dlh}{dvsu}{dln}{dvse}{dlc}'
      '{dlh}{dvsu}{dln}{dsv}{dlc}'
      '{dln}{dvse}{dlc}{dls}{dsv}'
      '{dln}{dvse}{dlc}{dln}{dsv}'
      '{dli}{dle}{dlka}{dvsii}'
      '{dli}{dle}{dlka}{dvsaa}'
      '{dli}{dle}{dlka}{dvso}'
      '{dvsi}{dle}{dlka}{dvsii}'
      '{dvsi}{dle}{dlka}{dvsaa}'
      '{dvsi}{dle}{dlka}{dvso}'
      '{dli}{dlc}{dln}{dsv}'
      '{dvsi}{dlc}{dln}{dsv}'
      '{dli}{dlc}{dls}{dsv}'
      '{dvsi}{dlc}{dls}{dsv}'
      '{dle}{dlc}{dln}{dsv}'
      '{dvse}{dlc}{dln}{dsv}'
      '{dle}{dlc}{dls}{dsv}'
      '{dvse}{dlc}{dls}{dsv}'
      '{dlc}{dvsi}{dln}{dsv}'
      '{dlc}{dvse}{dls}{dsv}'
      '{dlc}{dsv}{dly}{dvsau}'
      '{dltha}{dvsi}{dln}{dsv}'
      '{dltha}{dvsi}{dly}{dvso}'
      '{dltha}{dvsi}{dly}{dvsau}'
      '{dltha}{dvsi}{dls}{dsv}'
      '{dltha}{dsv}{dly}{dvso}'
      '{dltha}{dsv}{dly}{dvsau}'
      '{dld}{dvsi}{dly}{dvso}'
      '{dld}{dvse}{dlkha}{dvsi}'
      '{dld}{dvse}{dlkha}{dvsii}'
      '{dll}{dvsaa}{dln}{dsv}'
      '{dlm}{dvsaa}{dltha}{dvsi}'
      '{dln}{dvse}{dlka}{dvsai}'
      '{dln}{dvse}{dlka}{dvsaa}'
      '{dln}{dvse}{dlka}{dvso}'
      '{dln}{dvse}{dlc}{dvsau}'
      '{dlh}{dvso}{dls}{dsv}'
      '{dli}{dln}{dsv}{dlc}'
      '{dvsi}{dln}{dsv}{dlc}'
      '{dln}{dvse}{dlc}{dvsu}'
      '{dli}{dlc}{dvsau}'
      '{dvsi}{dlc}{dvsau}'
      '{dli}{dls}{dsv}'
      '{dvsi}{dls}{dsv}'
      '{dvsi}{dly}{dvso}'
      '{dli}{dly}{dvso}'
      '{dle}{dlka}{dvsaa}'
      '{dvse}{dlka}{dvsaa}'
      '{dle}{dlka}{dvsii}'
      '{dvse}{dlka}{dvsii}'
      '{dle}{dlka}{dvsai}'
      '{dvse}{dlka}{dvsai}'
      '{dle}{dlka}{dvso}'
      '{dvse}{dlka}{dvso}'
      '{dle}{dlc}{dvsu}'
      '{dvse}{dlc}{dvsu}'
      '{dle}{dlc}{dvsau}'
      '{dvse}{dlc}{dvsau}'
      '{dlc}{dln}{dsv}'
      '{dlc}{dls}{dsv}'
      '{dltha}{dvsi}{dle}'
      '{dlpa}{dlr}{dsv}'
      '{dlb}{dly}{dvso}'
      '{dlh}{dlr}{dvsu}'
      '{dlh}{dlr}{dvsuu}'
      '{dvsi}{dld}{dvsaa}'
      '{dli}{dld}{dvsaa}'
      '{dvsi}{dld}{dvso}'
      '{dli}{dld}{dvso}'
      '{dvsi}{dld}{dvsai}'
      '{dli}{dld}{dvsai}'
      '{dln}{dvse}{dlc}'
      '{dli}{dlc}'
      '{dvsi}{dlc}'
      '{dle}{dlc}'
      '{dvse}{dlc}'
      '{dlc}{dvsu}'
      '{dlc}{dvse}'
      '{dlc}{dvsau}'
      '{dltha}{dvsii}'
      '{dltha}{dvse}'
      '{dld}{dvsaa}'
      '{dld}{dvsii}'
      '{dld}{dvsai}'
      '{dld}{dvso}'
      '{dln}{dvsu}'
      '{dln}{dvse}'
      '{dly}{dvso}'
      '{dly}{dvsau}'
      '{dlc}'
        // Single shared action for every entry above.
        (delete)
    )
  )

)

// stem: exported entry point. The whole algorithm runs in backward mode,
// i.e. it works on suffixes from the end of the word.
//
//   1. remove_category_1 is attempted once; `do` ignores its outcome.
//   2. Then, repeatedly: remove_category_2 is attempted (outcome ignored)
//      and remove_category_3 must succeed. The loop stops the first time
//      remove_category_3 finds no suffix to delete.
define stem as backwards (
  do remove_category_1

  repeat (
    (do remove_category_2)
    remove_category_3
  )
)