Catalan stemming algorithm

Links to resources

Here is a sample of Catalan vocabulary, with the stemmed forms that will be generated by this algorithm:

word            stem            word            stem
abatuda         abat            gore            gor
abatut          abat            gorg            gorg
abatuts         abat            gorga           gorg
abbàssida       abbas           gorges          gorg
abbàssides      abbas           gorgs           gorg
abbé            abb             goril           gor
abdalwadita     abdalwad        gorja           gorj
abdalwadites    abdalwad        gorra           gorr
abdicació       abdic           gorres          gorr
abdicar         abdic           gosa            gos
abdicat         abdic           gosadia         gosad
abdicà          abdic           gosar           gos

Letters in Catalan include the following accented forms,

á   é   í   ó   ú   à   è   ì   ò   ù   ü   ï

The following letters are vowels:

a   e   i   o   u   á   à   é   è   í   ï   ó   ò   ú   ü

The stemming algorithm

// Routines used internally by the stemmer.
routines (
    mark_regions
    cleaning
    R1
    R2
    attached_pronoun
    standard_suffix
    verb_suffix
    residual_suffix
)

// The stemmer's external entry point.
externals ( stem )

// Region boundaries; mark_regions assigns them.
integers ( p1 p2 )

// v is the vowel grouping.
groupings ( v )

// String literals use { and } to delimit escape sequences.
stringescapes {}

/* Named escapes for the non-ASCII characters used in this file */

// acute accents
stringdef a'   '{U+00E1}'
stringdef e'   '{U+00E9}'
stringdef i'   '{U+00ED}'
stringdef o'   '{U+00F3}'
stringdef u'   '{U+00FA}'

// grave accents
stringdef a`   '{U+00E0}'
stringdef e`   '{U+00E8}'
stringdef i`   '{U+00EC}'
stringdef o`   '{U+00F2}'

// diaereses
stringdef i"   '{U+00EF}'
stringdef u"   '{U+00FC}'

// other characters
stringdef cc   '{U+00E7}'  // c-cedilla
stringdef .    '{U+00B7}'  // middle dot, used to write the geminated l

// Vowels: the five plain vowels together with their accented forms
// (i-grave is defined above but is not a member of this grouping).
define v 'aeiou' + '{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}'

define mark_regions as (

    // Start with both boundaries at the end of the word, so that a region
    // that is never found is empty.
    $p2 = limit
    $p1 = limit

    // Best effort: if the scan fails partway through, any boundary not yet
    // reached keeps the default assigned above.
    do (
        // p1: just past the first non-vowel that follows a vowel.
        ( gopast v  gopast non-v )  setmark p1
        // p2: the same scan again, continuing from p1.
        ( gopast v  gopast non-v )  setmark p2
    )
)

define cleaning as repeat (
    // Walk the word one character at a time: accented vowels are folded to
    // their unaccented base letter and the middle dot becomes a full stop.
    [substring] among(
        '{a'}' '{a`}'         (<- 'a')
        '{e'}' '{e`}'         (<- 'e')
        '{i'}' '{i`}' '{i"}'  (<- 'i')
        '{o'}' '{o`}'         (<- 'o')
        '{u'}' '{u"}'         (<- 'u')
        '{.}'                 (<- '.')
        ''                    (next)    // any other character: step over it
    )
)

backwardmode (

    // Succeeds when the cursor is at or after p1, i.e. inside region R1.
    define R1 as ( $p1 <= cursor )

    // Succeeds when the cursor is at or after p2, i.e. inside region R2.
    define R2 as ( $p2 <= cursor )

    define attached_pronoun as (
        // Remove an attached pronoun ending, provided it lies within R1.
        [substring] among (
            // forms joined with an apostrophe
            '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls' '{'}n' '{'}ns' '{'}m' '{'}t'

            // forms joined with a hyphen
            '-ls' '-la' '-les' '-li' '-nos' '-us' '-n' '-ns' '-me' '-m' '-te'

            // forms written with no separator
            'vos' 'se' 'nos' 'us' 'li' 'lo' 'los' 'me' 'le'
            'sela' 'selo' 'selas' 'selos'
            'la' 'las' 'les' 'ens' 'ho' 'hi'

            (R1 delete)
        )
    )

    // Remove or rewrite a standard (noun/adjective/adverb-style) suffix.
    // The ending is matched against the table below and bracketed by
    // [ ... ]; the parenthesised action that follows a group of strings is
    // then applied to it. Each action first tests R1 or R2, so the routine
    // fails when the matched ending is not inside the required region.
    define standard_suffix as (
        [substring] among(
            // Group 1: endings that are simply deleted when inside R1.
            'ar' 'atge' 'formes' 'icte' 'ictes'
            'ell' 'ells' 'ella'  '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta'
            'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls'
            'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
            'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
            'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
            '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{cc}a' 'nces' '{o'}' 'dor' 'all'
            'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
            '{o'}s' 'osa'  'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
            'itar' 'ables' 'adors' 'idores' 'idors'
            'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es'
            'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris'
            'ats' 'ions'  'ota' 'isam' 'ors' 'ora' 'ores' 'isament'
            'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes'
            'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies'
            '{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles'
            'assa' 'asses' 'assos'
             'ent' 'ents'
             '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin'
             'ims' 'ima' 'imes'
             'isme' 'ista' 'ismes' 'istes'
             // NOTE(review): '{i'}inia' sits next to '{i'}nies' and may be a
             // typo for '{i'}nia' -- confirm against the reference
             // vocabulary before changing, since it would alter stems.
             'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius'
             'oses' 'osos' 'ient' 'otes' 'ots'
            (R1 delete)
            // Group 2: endings deleted only when inside R2.
            'acions' 'ada' 'ades'
            (R2 delete)
            // Group 3: "-logia"-type endings, reduced to 'log' when inside R2.
            // NOTE(review): there is no whitespace between 'log{i'}es' and
            // 'logia' below; presumably still read as two literals -- confirm.
            'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques'
            (R2 <- 'log')
            // Group 4: "-ic" family, normalised to 'ic' when inside R2.
            'ic' 'ica' 'ics' 'iques'
            (R2 <- 'ic')
            // Group 5: "-quissim" family, replaced by 'c' when inside R1.
            'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima'
            (R1 <- 'c')
        )
    )

    // Remove a verb ending. The ending is matched against the table below
    // and bracketed by [ ... ]; every string except 'ando' is deleted when
    // it lies inside R1, while 'ando' is deleted only when inside R2. The
    // routine fails when the matched ending is outside the required region.
    define verb_suffix as (
        [substring] among(
            // Group 1: endings deleted when inside R1.
            'ador' 'adora'  'adors' 'adores' 're' 'ie'
             'ent' 'ents' 'udes' 'ar{a`}' 'eren'
            'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
            'aria' 'arian' 'arien' 'aries' 'ar{a`}s'
            'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara'
            'ar{e'}' 'ar{e'}s'
            'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
            'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
            'er{e'}' 'er' 'erau' 'erass'
            'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
            'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
            'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu'
            'ia' 'ies' '{i'}em' '{i`}eu' 'ien'
            'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats'
            'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu'
            'essen' 'esses' 'assen' 'asses' 'assim' 'assiu'
            '{e'}ssen' '{e'}sseu'  '{e'}ssim' '{e'}ssiu' '{e'}ssem'
            '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren'
            'ar{i'}em' 'ar{i'}eu'
            'areu' 'aren' 'ant' '{i"}m' '{i"}u'
            '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es'
            'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da'
            'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its'
            'id' 'ids'  'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
            'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
            'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as'
            'ieu' 'ii' 'io' 'i{a`}'
            'ess' 'essin' 'essis'  'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu'
            'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
            'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
            'ierais'  'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
            'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques'
            '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
            'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien'
            'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu'
            'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis'
            'eixen' 'eixo' 'isin' 'isis'  'esques' 'sis' 'sin'
            'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen'
            'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim'
            '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu'
            '{i"}ra' '{i"}ren' '{i"}res'
            '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x'
            'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis'
            'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s'
                (R1 delete)
            // Group 2: 'ando' is deleted only when inside R2.
            'ando'
                (R2 delete)
        )
    )

    define residual_suffix as (
        // Final pass over whatever ending is left; every action requires
        // the matched ending to lie inside R1.
        [substring] among(
            // plain final vowels
            'a' 'e' 'i' 'o'
            // accented final vowels
            '{a'}' '{a`}' '{e'}' '{i'}' '{i`}' '{i"}' '{o'}'
            // other short endings
            's' 'os' 'is' 'eu' 'iu' 'ir' 'it' 'itz' '{i"}n' '{i"}s'
            (R1 delete)

            // rewrite a trailing iqu as ic
            'iqu'
            (R1 <- 'ic')
        )
    )
)

define stem as (
    // Locate p1 and p2 before any suffix is touched.
    do mark_regions

    // Suffix removal works from the end of the word towards the start.
    backwards (
        do attached_pronoun
        // verb_suffix is only attempted when standard_suffix fails.
        do ( standard_suffix or verb_suffix )
        do residual_suffix
    )

    // Finally fold the remaining accented characters.
    do cleaning
)

/*
     First working version: 2010/07/19
     Initial grammar review based on: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0
     Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos
     Irregular verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
*/