Armenian stemming algorithm

Links to resources

Here is a sample of Armenian vocabulary, with the stemmed forms that will be generated by this algorithm:

| word | stem | word | stem |
|------|------|------|------|
| աղոթում | աղոթ | բանաձևեր | բանաձև |
| աղոթք | աղոթ | բանաձևերը | բանաձև |
| աղոթքը | աղոթ | բանաձևերի | բանաձև |
| աղոթքի | աղոթ | բանաձևերից | բանաձև |
| աղոթքին | աղոթ | բանաձևերն | բանաձև |
| աղոթքից | աղոթ | բանաձևերով | բանաձև |
| աղոթքն | աղոթ | բանաձևերում | բանաձև |
| աղոթքներ | աղոթ | բանաձևը | բանաձև |
| աղոթքները | աղոթ | բանաձևի | բանաձև |
| աղոթքների | աղոթ | բանաձևին | բանաձև |
| աղոթքներին | աղոթ | բանաձևից | բանաձև |

This stemmer for Armenian was developed and contributed by Astghik Mkrtchyan.

The following characters are vowels for the purposes of this algorithm:

ա է ի օ ւ ե ո ը

R2 is the region after the first non-vowel following a vowel after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.

RV is the region after the first vowel, or the end of the word if it contains no vowel.

The algorithm has a fairly simple structure which only removes suffixes. There are four steps, applied in turn:

  • an "ending" is removed, if one is found in R2,
  • a verb suffix is removed, if one is found in RV,
  • an adjective suffix is removed, if one is found in RV,
  • a noun suffix is removed, if one is found in RV.

See the Snowball implementation of the stemmer below for the lists of suffixes each step checks for.

The full algorithm in Snowball

// Letter macros.
//
// `stringescapes {}` makes `{` and `}` the escape delimiters inside string
// literals, so `{a}` in a literal expands to the string defined for `a` and
// `{U+0561}` denotes the character with that Unicode code point.
//
// Each stringdef below binds a short Latin mnemonic to one Armenian
// lower-case letter (U+0561 .. U+0587).  The hex number in the trailing
// comment appears to be the code point of the corresponding capital letter
// (e.g. 531 for U+0531); `ev` (U+0587) carries no such number.
stringescapes {}

stringdef a    '{U+0561}' // 531
stringdef b    '{U+0562}' // 532
stringdef g    '{U+0563}' // 533
stringdef d    '{U+0564}' // 534
stringdef ye   '{U+0565}' // 535
stringdef z    '{U+0566}' // 536
stringdef e    '{U+0567}' // 537
stringdef y    '{U+0568}' // 538
stringdef dt   '{U+0569}' // 539
stringdef zh   '{U+056A}' // 53A
stringdef i    '{U+056B}' // 53B
stringdef l    '{U+056C}' // 53C
stringdef kh   '{U+056D}' // 53D
stringdef ts   '{U+056E}' // 53E
stringdef k    '{U+056F}' // 53F
stringdef h    '{U+0570}' // 540
stringdef dz   '{U+0571}' // 541
stringdef gh   '{U+0572}' // 542
stringdef djch '{U+0573}' // 543
stringdef m    '{U+0574}' // 544
stringdef j    '{U+0575}' // 545
stringdef n    '{U+0576}' // 546
stringdef sh   '{U+0577}' // 547
stringdef vo   '{U+0578}' // 548
stringdef ch   '{U+0579}' // 549
stringdef p    '{U+057A}' // 54A
stringdef dj   '{U+057B}' // 54B
stringdef r    '{U+057C}' // 54C
stringdef s    '{U+057D}' // 54D
stringdef v    '{U+057E}' // 54E
stringdef t    '{U+057F}' // 54F
stringdef r'   '{U+0580}' // 550
stringdef c    '{U+0581}' // 551
stringdef u    '{U+0582}' // 552                  //vjun
stringdef bp   '{U+0583}' // 553
stringdef q    '{U+0584}' // 554
stringdef ev   '{U+0587}' // listed out of code-point order, before o and f
stringdef o    '{U+0585}' // 555
stringdef f    '{U+0586}' // 556

// Internal routines: mark_regions computes the region marks, R2 is the
// region test, and adjective / verb / noun / ending are the four
// suffix-removal steps called from stem.
routines ( mark_regions R2
           adjective
           verb
           noun
           ending
)

// stem is the only routine exported to callers.
externals ( stem )

// Cursor positions set by mark_regions: pV is the start of RV, p2 the
// start of R2.
integers ( pV p2 )

groupings ( v )

// Vowels, in this order: ա է ի օ ւ ե ո ը
define v '{a}{e}{i}{o}{u}{ye}{vo}{y}'

// Compute the region marks pV (start of RV) and p2 (start of R2).
//
// Both marks default to `limit` (the end of the word), so a word that is
// too short to contain a region gets an empty one.
define mark_regions as (

    $pV = limit
    $p2 = limit
    // Wrapped in `do` so that running out of word part-way through is not a
    // failure: marks already set are kept, the others stay at `limit`.
    do (
        // pV: just after the first vowel.  The following `gopast non-v`
        // moves on past the next non-vowel (the usual start of R1, which
        // is not recorded here).
        gopast v  setmark pV  gopast non-v
        // p2: just after the first non-vowel following a vowel, counted
        // from the position reached above.
        gopast v  gopast non-v  setmark p2
       )
)

backwardmode (

    // Region test: succeeds when the cursor is at or after p2.  It is used
    // in backward mode, so after a suffix has been matched this checks that
    // the whole suffix lies inside R2.
    define R2 as $p2 <= cursor

    // Adjective step: find the longest suffix from the list below that ends
    // at the cursor (matching backwards) and delete it.  The routine fails,
    // changing nothing, when no listed suffix matches.  The order of the
    // strings does not affect which one is chosen; `substring` always
    // selects the longest match.
    define adjective as (
        [substring] among (
            '{b}{a}{r'}'
            '{p}{ye}{s}'
            '{vo}{r'}{e}{n}'
            '{vo}{v}{i}{n}'
            '{a}{k}{i}'
            '{l}{a}{j}{n}'
            '{r'}{vo}{r'}{d}'
            '{ye}{r'}{vo}{r'}{d}'
            '{a}{k}{a}{n}'
            '{a}{l}{i}'
            '{k}{vo}{t}'
            '{ye}{k}{ye}{n}'
            '{vo}{r'}{a}{k}'
            '{ye}{gh}'
            '{v}{vo}{u}{n}'
            '{ye}{r'}{ye}{n}'
            '{a}{r'}{a}{n}'
            '{ye}{n}'
            '{a}{v}{ye}{t}'
            '{g}{i}{n}'
            '{i}{v}'
            '{a}{t}'
            '{i}{n}'

              // single action shared by every string above
              (delete)
        )
    )

    // Verb step: find the longest suffix from the list below that ends at
    // the cursor (matching backwards) and delete it.  The routine fails,
    // changing nothing, when no listed suffix matches.  The order of the
    // strings does not affect which one is chosen; `substring` always
    // selects the longest match.
    define verb as (
        [substring] among (
            '{vo}{u}{m}'
            '{v}{vo}{u}{m}'
            '{a}{l}{vo}{u}'
            '{ye}{l}{vo}{u}'
            '{v}{ye}{l}'
            '{a}{n}{a}{l}'
            '{ye}{l}{vo}{u}{c}'
            '{a}{l}{vo}{u}{c}'
            '{y}{a}{l}'
            '{y}{ye}{l}'
            '{a}{l}{vo}{v}'
            '{ye}{l}{vo}{v}'
            '{a}{l}{i}{s}'
            '{ye}{l}{i}{s}'
            '{ye}{n}{a}{l}'
            '{a}{c}{n}{a}{l}'
            '{ye}{c}{n}{ye}{l}'
            '{c}{n}{ye}{l}'
            '{n}{ye}{l}'
            '{a}{t}{ye}{l}'
            '{vo}{t}{ye}{l}'
            '{k}{vo}{t}{ye}{l}'
            '{t}{ye}{l}'
            '{v}{a}{ts}'
            '{ye}{c}{v}{ye}{l}'
            '{a}{c}{v}{ye}{l}'
            '{ye}{c}{i}{r'}'
            '{a}{c}{i}{r'}'
            '{ye}{c}{i}{n}{q}'
            '{a}{c}{i}{n}{q}'
            '{v}{ye}{c}{i}{r'}'
            '{v}{ye}{c}{i}{n}{q}'
            '{v}{ye}{c}{i}{q}'
            '{v}{ye}{c}{i}{n}'
            '{a}{c}{r'}{i}{r'}'
            '{a}{c}{r'}{ye}{c}'
            '{a}{c}{r'}{i}{n}{q}'
            '{a}{c}{r'}{i}{q}'
            '{a}{c}{r'}{i}{n}'
            '{ye}{c}{i}{q}'
            '{a}{c}{i}{q}'
            '{ye}{c}{i}{n}'
            '{a}{c}{i}{n}'
            '{a}{c}{a}{r'}'
            '{a}{c}{a}{v}'
            '{a}{c}{a}{n}{q}'
            '{a}{c}{a}{q}'
            '{a}{c}{a}{n}'
            '{v}{ye}{c}{i}'
            '{a}{c}{r'}{i}'
            '{ye}{c}{a}{r'}'
            '{ye}{c}{a}{v}'
            '{c}{a}{n}{q}'
            '{c}{a}{q}'
            '{c}{a}{n}'
            '{a}{c}{a}'
            '{a}{c}{i}'
            '{ye}{c}{a}'
            '{ch}{ye}{l}'
            '{ye}{c}{i}'
            '{a}{r'}'
            '{a}{v}'
            '{a}{n}{q}'
            '{a}{q}'
            '{a}{n}'
            '{a}{l}'
            '{ye}{l}'
            '{ye}{c}'
            '{a}{c}'
            '{v}{ye}'
            '{a}'

                // single action shared by every string above
                (delete)
        )
    )

    // Noun step: find the longest suffix from the list below that ends at
    // the cursor (matching backwards) and delete it.  The routine fails,
    // changing nothing, when no listed suffix matches.  The order of the
    // strings does not affect which one is chosen; `substring` always
    // selects the longest match.
    define noun as (
        [substring] among (
            '{a}{ts}{vo}'
            '{a}{n}{a}{k}'
            '{a}{n}{o}{c}'
            '{a}{r'}{a}{n}'
            '{a}{r'}{q}'
            '{p}{a}{n}'
            '{s}{t}{a}{n}'
            '{ye}{gh}{e}{n}'
            '{ye}{n}{q}'
            '{i}{k}'
            '{i}{ch}'
            '{i}{q}'
            '{m}{vo}{u}{n}{q}'
            '{j}{a}{k}'
            '{j}{vo}{u}{n}'
            '{vo}{n}{q}'
            '{vo}{r'}{d}'
            '{vo}{c}'
            '{ch}{ye}{q}'
            '{v}{a}{ts}{q}'
            '{v}{vo}{r'}'
            '{a}{v}{vo}{r'}'
            '{vo}{u}{dt}{j}{vo}{u}{n}'
            '{vo}{u}{k}'
            '{vo}{u}{h}{i}'
            '{vo}{u}{j}{dt}'
            '{vo}{u}{j}{q}'
            '{vo}{u}{s}{t}'
            '{vo}{u}{s}'
            '{c}{i}'
            '{a}{l}{i}{q}'
            '{a}{n}{i}{q}'
            '{i}{l}'
            '{i}{ch}{q}'
            '{vo}{u}{n}{q}'
            '{g}{a}{r'}'
            '{vo}{u}'
            '{a}{k}'
            '{a}{n}'
            '{q}'

                // single action shared by every string above
                (delete)
        )
    )

    // Ending step: find the longest suffix from the list below that ends at
    // the cursor (matching backwards), require that it lies entirely in R2,
    // and delete it.
    //
    // The R2 test sits between `substring` and `among`, so it is applied
    // once, to the longest match only: if that match is not inside R2 the
    // routine fails without trying a shorter suffix.  The order of the
    // strings does not affect which one is chosen.
    define ending as (
        [substring] R2 among (
            '{n}{ye}{r'}{y}'
            '{n}{ye}{r'}{n}'
            '{n}{ye}{r'}{i}'
            '{n}{ye}{r'}{d}'
            '{ye}{r'}{i}{c}'
            '{n}{ye}{r'}{i}{c}'
            '{ye}{r'}{i}'
            '{ye}{r'}{d}'
            '{ye}{r'}{n}'
            '{ye}{r'}{y}'
            '{n}{ye}{r'}{i}{n}'
            '{vo}{u}{dt}{j}{a}{n}{n}'
            '{vo}{u}{dt}{j}{a}{n}{y}'
            '{vo}{u}{dt}{j}{a}{n}{s}'
            '{vo}{u}{dt}{j}{a}{n}{d}'
            '{vo}{u}{dt}{j}{a}{n}'
            '{ye}{r'}{i}{n}'
            '{i}{n}'
            '{s}{a}'
            '{vo}{dj}'
            '{i}{c}'
            '{ye}{r'}{vo}{v}'
            '{n}{ye}{r'}{vo}{v}'
            '{ye}{r'}{vo}{u}{m}'
            '{n}{ye}{r'}{vo}{u}{m}'
            '{vo}{u}{n}'
            '{vo}{u}{d}'
            '{v}{a}{n}{s}'
            '{v}{a}{n}{y}'
            '{v}{a}{n}{d}'
            '{a}{n}{y}'
            '{a}{n}{d}'
            '{v}{a}{n}'
            '{vo}{dj}{y}'
            '{vo}{dj}{s}'
            '{vo}{dj}{d}'
            '{vo}{c}'
            '{vo}{u}{c}'
            '{vo}{dj}{i}{c}'
            '{c}{i}{c}'
            '{v}{i}{c}'
            '{v}{i}'
            '{v}{vo}{v}'
            '{vo}{v}'
            '{a}{n}{vo}{v}'
            '{a}{n}{vo}{u}{m}'
            '{v}{a}{n}{i}{c}'
            '{a}{m}{b}'
            '{a}{n}'
            '{n}{ye}{r'}'
            '{ye}{r'}'
            '{v}{a}'
            '{y}'
            '{n}'
            '{d}'
            '{c}'
            '{i}'

                // single action shared by every string above
                (delete)
        )
    )
)

// Entry point: stem the current word in place.
//
// The region marks are computed first; the four suffix-removal steps then
// run in backward mode (from the end of the word).
define stem as (

    do mark_regions
    // `setlimit tomark pV` stops backward matching at pV, so every suffix
    // removed below must lie inside RV.
    backwards setlimit tomark pV for (
        // Each step is wrapped in `do`, so a step that finds nothing to
        // remove does not prevent the later steps from running.
        do ending
        do verb
        do adjective
        do noun
    )
)