Here is a sample of Italian vocabulary, with the stemmed forms that will be generated by this algorithm:
word | stem | word | stem |
abbandonata abbandonate abbandonati abbandonato abbandonava abbandonerà abbandoneranno abbandonerò abbandono abbandonò abbaruffato abbassamento abbassando abbassandola abbassandole abbassar abbassare abbassarono abbassarsi abbassassero abbassato abbassava abbassi abbassò abbastanza abbatté abbattendo abbattere abbattersi abbattesse abbatteva abbattevamo abbattevano abbattimento abbattuta abbattuti abbattuto abbellita abbenché abbi |
⇒ |
abbandon abbandon abbandon abbandon abbandon abbandon abbandon abbandon abband abbandon abbaruff abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbast abbatt abbatt abbatt abbatt abbattess abbatt abbatt abbatt abbatt abbatt abbatt abbatt abbell abbenc abbi |
pronto pronuncerà pronuncia pronunciamento pronunciare pronunciarsi pronunciata pronunciate pronunciato pronunzia pronunziano pronunziare pronunziarle pronunziato pronunzio pronunziò propaga propagamento propaganda propagare propagarla propagarsi propagasse propagata propagazione propaghino propalate propende propensi propensione propini propio propizio propone proponendo proponendosi proponenti proponeva proponevano proponga |
⇒ |
pront pronunc pronunc pronunc pronunc pronunc pronunc pronunc pronunc pronunz pronunz pronunz pronunz pronunz pronunz pronunz propag propag propagand propag propag propag propag propag propag propaghin propal prop propens propension propin prop propiz propon propon propon proponent propon propon propong |
Italian can include the following accented forms: á é í ó ú à è ì ò ù
First, replace all acute accents by grave accents. And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.)
The vowels are then a e i o u à è ì ò ù
R2 (see the note on R1 and R2) and RV have the same definition as in the Spanish stemmer.
First exceptional cases are checked for. These need to match the whole word, and currently are: divano, which is stemmed to divan (otherwise it would stem to div and collide with diva).
If found then handle as described and that's it.
Otherwise always do steps 0 and 1.
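Step 0: Attached pronoun
Search for the longest among the following suffixes
ci gli la le li lo mi ne si ti vi sene gliela gliele glieli glielo gliene mela mele meli melo mene tela tele teli telo tene cela cele celi celo cene vela vele veli velo vene
following one of
(a) ando endo
(b) ar er ir
in RV. In case of (a) the suffix is deleted, in case (b) it is replaced by e (guardandogli → guardando, accomodarci → accomodare)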
Step 1: Standard suffix removal
Do step 2 if no ending was removed by step 1.
Step 2: Verb suffixes
Always do steps 3a and 3b.
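Step 3a
Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a preceding i if it is in RV (crocchio → crocch)
Step 3b
Replace final ch (or gh) with c (or g) if in RV (crocch → crocc)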
Finally, turn I and U back into lower case.
// Routines defined in this file.
routines (
    exceptions
    prelude
    postlude
    mark_regions
    RV
    R1
    R2
    attached_pronoun
    standard_suffix
    verb_suffix
    vowel_suffix
)

// The one routine exported to callers.
externals ( stem )

// Region marks; all three are assigned in mark_regions.
integers (
    pV
    p1
    p2
)

// Character groupings; each is defined next to the routines that use it.
groupings (
    v
    AEIO
    CG
)

// '{' and '}' bracket stringdef names and U+ escapes inside string literals.
stringescapes {}
/* special characters */

// Acute-accented vowels (prelude rewrites these to the grave forms).
stringdef a'   '{U+00E1}'
stringdef e'   '{U+00E9}'
stringdef i'   '{U+00ED}'
stringdef o'   '{U+00F3}'
stringdef u'   '{U+00FA}'

// Grave-accented vowels.
stringdef a`   '{U+00E0}'
stringdef e`   '{U+00E8}'
stringdef i`   '{U+00EC}'
stringdef o`   '{U+00F2}'
stringdef u`   '{U+00F9}'

// Vowels: the five plain vowels plus their grave-accented forms.
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
// Forward-mode normalisation, run before the regions are marked.
define prelude as (

    // Pass 1: walk the whole word, replacing each acute-accented vowel by
    // its grave form and marking the u of "qu" as 'U'.  The pass runs under
    // `test`, so the cursor is back at the start of the word afterwards.
    test repeat (
        [substring] among (
            '{a'}'  ( <- '{a`}' )
            '{e'}'  ( <- '{e`}' )
            '{i'}'  ( <- '{i`}' )
            '{o'}'  ( <- '{o`}' )
            '{u'}'  ( <- '{u`}' )
            'qu'    ( <- 'qU' )
            ''      ( next )    // nothing to rewrite here: step over one character
        )
    )

    // Pass 2: a 'u' or 'i' with a vowel on both sides is put into upper
    // case ('U' / 'I').
    repeat goto (
        v [ ( 'u' ] v <- 'U' ) or
            ( 'i' ] v <- 'I' )
    )
)
// Sets the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against.
define mark_regions as (

    // Defaults: every mark sits at the end of the word (empty regions).
    $pV = limit
    $p1 = limit
    $p2 = limit

    // pV.  The position depends on the first two letters:
    //   vowel, non-vowel      -> just past the next vowel
    //   vowel, vowel          -> just past the next non-vowel
    //   non-vowel, non-vowel  -> just past the next vowel
    //   non-vowel, vowel      -> one character past that vowel
    // Under `do`, so if no case applies pV keeps its default.
    do (
        ( v ( non-v gopast v ) or ( v gopast non-v ) )
        or
        ( non-v ( non-v gopast v ) or ( v next ) )
        setmark pV
    )

    // p1: just past the first non-vowel that follows a vowel.
    // p2: the same rule applied again, starting from p1.
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)
// Walks the word and turns the upper-case markers 'U' and 'I' back into
// lower case; any other character is stepped over.
define postlude as repeat (
    [substring] among (
        'U'  ( <- 'u' )
        'I'  ( <- 'i' )
        ''   ( next )
    )
)
backwardmode (
// Region tests, used while scanning backwards from the end of the word.
// Each one succeeds when the cursor is at or after the corresponding mark
// set by mark_regions.
// RV: cursor is not before pV.
define RV as $pV <= cursor
// R1: cursor is not before p1.
define R1 as $p1 <= cursor
// R2: cursor is not before p2.
define R2 as $p2 <= cursor
// Step 0: strip a pronoun attached to a gerund or infinitive.
define attached_pronoun as (

    // Bracket the longest pronoun ending the word.
    [substring] among (
        // simple forms
        'mi' 'ti' 'ci' 'vi' 'si'
        'lo' 'la' 'li' 'le' 'gli' 'ne'
        // compound forms
        'mela' 'mele' 'meli' 'melo' 'mene'
        'tela' 'tele' 'teli' 'telo' 'tene'
        'cela' 'cele' 'celi' 'celo' 'cene'
        'vela' 'vele' 'veli' 'velo' 'vene'
        'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
        'sene'
    )

    // The pronoun must follow one of these verb endings, and that ending
    // must lie in RV.  The delete / <- below act on the bracketed pronoun,
    // not on the verb ending.
    among ( (RV)
        'ando' 'endo'    ( delete )     // gerund: drop the pronoun
        'ar' 'er' 'ir'   ( <- 'e' )     // clipped infinitive: pronoun becomes e
    )
)
// Step 1: standard suffix removal.  The longest listed suffix is bracketed
// and the action attached to its group is run.  If that action's region
// test fails the routine fails, which lets the caller try verb_suffix.
define standard_suffix as (
    [substring] among (

        'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
        'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
        'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
        'atrice' 'atrici'
        'ante' 'anti' // Note 1 (see the note at the end of the file)
            // delete if in R2
            ( R2 delete )

        'azione' 'azioni' 'atore' 'atori'
            // delete if in R2; then, if preceded by ic, delete that too
            // if it is in R2
            ( R2 delete
              try ( ['ic'] R2 delete )
            )

        'logia' 'logie'
            // replace with log if in R2
            ( R2 <- 'log' )

        'uzione' 'uzioni' 'usione' 'usioni'
            // replace with u if in R2
            ( R2 <- 'u' )

        'enza' 'enze'
            // replace with ente if in R2
            ( R2 <- 'ente' )

        'amento' 'amenti' 'imento' 'imenti'
            // delete if in RV
            ( RV delete )

        'amente' (
            // delete if in R1
            R1 delete
            // then, if preceded by iv, os, ic or abil lying in R2, delete
            // that; after iv, a further preceding at in R2 is deleted too
            try (
                [substring] R2 delete among (
                    'iv' ( ['at'] R2 delete )
                    'os' 'ic' 'abil'
                )
            )
        )

        'it{a`}' (
            // delete if in R2
            R2 delete
            // then, if preceded by abil, ic or iv, delete it if in R2
            try (
                [substring] among (
                    'abil' 'ic' 'iv' ( R2 delete )
                )
            )
        )

        'ivo' 'ivi' 'iva' 'ive' (
            // delete if in R2
            R2 delete
            // then, if preceded by at, delete it if in R2, and likewise a
            // further preceding ic
            try ( ['at'] R2 delete ['ic'] R2 delete )
        )
    )
)
// Step 2: verb suffixes.  `setlimit tomark pV` stops the backward scan at
// mark pV, so a suffix is only recognised when it lies wholly after pV.
// The longest matching suffix is deleted.
define verb_suffix as setlimit tomark pV for (
    [substring] among (
        'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
        'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
        'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
        'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
        'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
        // NOTE(review): 'Yamo' starts with an upper-case Y, but the prelude
        // in this file only introduces the markers 'U' and 'I'.  This entry
        // presumably never matches lower-case input; confirm whether it is
        // a leftover or was meant to be another form.
        'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
        'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
        'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
        'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
        'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
        'ono' 'uta' 'ute' 'uti' 'uto'

        'ar' 'ir' // but 'er' is problematical
            ( delete )
    )
)
// Final vowels removable in step 3a (note: no u).
define AEIO 'aeio{a`}{e`}{i`}{o`}'

// Consonants after which a final h is dropped in step 3b.
define CG 'cg'

// Steps 3a and 3b.
define vowel_suffix as (

    // 3a: delete a final AEIO vowel if it is in RV, then a preceding i if
    // that is in RV too.  If only the first deletion applies it is kept.
    try (
        [AEIO] RV delete
        ['i'] RV delete
    )

    // 3b: a final h preceded by c or g, with both letters in RV, is
    // deleted (ch -> c, gh -> g).
    try (
        ['h'] CG RV delete
    )
)
)
// Whole-word exceptions, checked before any other processing.
define exceptions as (
    // Otherwise "divano" stems to "div" and collides with "diva".
    // `atlimit` requires the match to reach the end of the word.
    [ 'divano' atlimit ] <- 'divan'
)
// Entry point.  A word handled by `exceptions` is finished at once;
// otherwise the normal steps run.  Each step is under `do`, so a step that
// finds nothing to change does not stop the ones after it.
define stem as (
    exceptions or (
        do prelude
        do mark_regions
        backwards (
            // step 0
            do attached_pronoun
            // step 1, falling back to step 2 when step 1 removes nothing
            do ( standard_suffix or verb_suffix )
            // steps 3a and 3b
            do vowel_suffix
        )
        do postlude
    )
)
/*
Note 1: additions of 15 Jun 2005
*/