Here is a sample of Italian vocabulary, with the stemmed forms that will be generated by this algorithm:
| word | stem | word | stem | |||||||||
|
abbandonata abbandonate abbandonati abbandonato abbandonava abbandoneranno abbandonerà abbandonerò abbandono abbandonò abbaruffato abbassamento abbassando abbassandola abbassandole abbassar abbassare abbassarono abbassarsi abbassassero abbassato abbassava abbassi abbassò abbastanza abbattendo abbattere abbattersi abbattesse abbatteva abbattevamo abbattevano abbattimento abbattuta abbattuti abbattuto abbatté abbellita abbenché abbi |
⇒ |
abbandon abbandon abbandon abbandon abbandon abbandon abbandon abbandon abband abbandon abbaruff abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbass abbast abbatt abbatt abbatt abbattess abbatt abbatt abbatt abbatt abbatt abbatt abbatt abbatt abbell abbenc abbi |
pronto pronuncerà pronuncia pronunciamento pronunciare pronunciarsi pronunciata pronunciate pronunciato pronunzia pronunziano pronunziare pronunziarle pronunziato pronunzio pronunziò propaga propagamento propaganda propagare propagarla propagarsi propagasse propagata propagazione propaghino propalate propende propensi propensione propini propio propizio propone proponendo proponendosi proponenti proponeva proponevano proponga |
⇒ |
pront pronunc pronunc pronunc pronunc pronunc pronunc pronunc pronunc pronunz pronunz pronunz pronunz pronunz pronunz pronunz propag propag propagand propag propag propag propag propag propag propaghin propal prop propens propension propin prop propiz propon propon propon proponent propon propon propong |
The elisions handled by the stemmer are chosen with improving information retrieval in mind. We therefore don't remove some elisions which are rare in practice, we don't remove elisions where both parts carry useful meaning (e.g. sant'antonio - "Saint Anthony"), and we don't remove elisions which are only used with words which don't carry much useful meaning in this context (e.g. c').
Italian can include the following accented forms: á é í ó ú à è ì ò ù
First, replace all acute accents by grave accents. And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.)
The vowels are then a e i o u à è ì ò ù
R2 (see the note on R1 and R2) and RV have the same definition as in the Spanish stemmer.
R2 is defined in the usual way — see the note on R1 and R2.
RV is defined as follows (this is the same as the Spanish stemmer definition, except for the initial exceptional case):
If the word begins with divan then RV starts after this prefix. Otherwise, if the second letter is a consonant, RV is the region after the next following vowel, or if the first two letters are vowels, RV is the region after the next consonant, and otherwise (consonant-vowel case) RV is the region after the third letter. But RV is the end of the word if these positions cannot be found.
Before the numbered steps, we remove elisions. If the word starts with one of d l m s t v all dall dell gl nell quell quest sull tutt or un, followed by an apostrophe (') which is not at the end of the word, then remove everything from the start of the word up to and including this apostrophe.
Always do steps 0 and 1.
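Step 0: Attached pronoun
Search for the longest among the following suffixes
ci gli la le li lo mi ne si ti vi sene gliela gliele glieli glielo gliene mela mele meli melo mene tela tele teli telo tene cela cele celi celo cene vela vele veli velo vene
following one of
(a) ando endo
(b) ar er ir
in RV. In case of (a) the suffix is deleted, in case (b) it is replaced by e (guardandogli → guardando, accomodarci → accomodare)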
Step 1: Standard suffix removal
Do step 2 if no ending was removed by step 1.
Step 2: Verb suffixes
Always do steps 3a and 3b.
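Step 3a
Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a preceding i if it is in RV.
Step 3b
Replace final ch (or gh) with c (or g) if in RV.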
Finally, turn I and U back into lower case.
// Forward declarations for the Italian stemmer. Every routine, integer and
// grouping named below is defined later in this file.
routines (
elisions
prelude postlude mark_regions
RV R1 R2
attached_pronoun
standard_suffix
verb_suffix
vowel_suffix
)
// stem is the routine declared as externally visible.
externals ( stem )
// Region marks set by mark_regions: pV is tested by RV, p1 by R1, p2 by R2.
integers ( pV p1 p2 )
// Character classes: v is the vowel set; AEIO and CG are used by vowel_suffix.
groupings ( v AEIO CG )
// Inside string literals, { and } delimit an escape: either a code point
// such as {U+00E0} or the name of a stringdef such as {a`}.
stringescapes {}
/* Accented vowels. Each is named by its base letter followed by the accent:
   x' is x with an acute accent, x` is x with a grave accent. */

// acute forms (prelude rewrites these to the grave forms)
stringdef a'   '{U+00E1}'
stringdef e'   '{U+00E9}'
stringdef i'   '{U+00ED}'
stringdef o'   '{U+00F3}'
stringdef u'   '{U+00FA}'

// grave forms
stringdef a`   '{U+00E0}'
stringdef e`   '{U+00E8}'
stringdef i`   '{U+00EC}'
stringdef o`   '{U+00F2}'
stringdef u`   '{U+00F9}'

// Vowels: the five plain vowels plus their grave-accented forms.
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
// Strip an elided article/pronoun/preposition from the front of the word.
// The longest matching prefix (including its apostrophe) is bracketed as the
// slice; it is removed only when something follows the apostrophe.
define elisions as (
    [ substring ] not atlimit among (
        // single-letter forms
        // ('c{'}' is deliberately absent: it doesn't seem useful to remove.)
        'd{'}'      // d'Italia ("of Italy")
        'l{'}'      // l'anno ("the year")
        'm{'}'      // m'ama ("he loves me")
        's{'}'      // s'innamora ("he falls in love")
        't{'}'      // t'amo ("I love you")
        'v{'}'      // v'adoro ("I adore you")

        // two-letter forms
        'gl{'}'     // gl'inglesi ("the English")
        'un{'}'     // un'eccentricità ("an eccentricity")

        // longer forms
        'all{'}'    // all'università ("at university")
        'dall{'}'   // dall'album ("from the album")
        'dell{'}'   // dell'anno ("of the year")
        'nell{'}'   // nell'estate ("in the summer")
        'sull{'}'   // sull'isola ("on the island")
        'tutt{'}'   // tutt'Europa ("all of Europe")
        'quell{'}'  // quell'anno ("that year")
        'quest{'}'  // quest'anno ("this year")
            ( delete )
    )
)
// Normalise the word before regions are marked.
define prelude as (

    // Pass 1: walk the whole word, turning every acute-accented vowel into
    // its grave-accented form and marking the u of "qu" as 'U'. The empty
    // string entry steps over any other character. `test` puts the cursor
    // back at the start once the pass is finished.
    test repeat (
        [substring] among (
            'qu'     ( <- 'qU' )
            '{a'}'   ( <- '{a`}' )
            '{e'}'   ( <- '{e`}' )
            '{i'}'   ( <- '{i`}' )
            '{o'}'   ( <- '{o`}' )
            '{u'}'   ( <- '{u`}' )
            ''       ( next )
        )
    )

    // Pass 2: a u or an i standing between two vowels is put into upper case.
    repeat goto (
        v [
            ( 'u' ] v <- 'U' ) or
            ( 'i' ] v <- 'I' )
    )
)
// Set the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against.
define mark_regions as (

    // Defaults: each mark sits at the end of the word until a position is found.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV
    do (
        (
            // Word starts with a vowel: if the second letter is a consonant,
            // go past the next vowel; if it is a vowel, go past the next
            // consonant.
            ( v ( ( non-v gopast v ) or ( v gopast non-v ) ) )
            or
            // Otherwise "divano" stems to "div" and collides with "diva".
            'divan'
            or
            // Word starts with a consonant: if the second letter is a
            // consonant, go past the next vowel; if it is a vowel, step over
            // one more character (i.e. past the third letter).
            ( non-v ( ( non-v gopast v ) or ( v next ) ) )
        )
        setmark pV
    )

    // p1 follows the first vowel + non-vowel pair; p2 follows the next such
    // pair after p1.
    do (
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )
)
// Undo the upper-case marking applied by prelude: every 'U' becomes 'u' and
// every 'I' becomes 'i'; any other character is stepped over unchanged.
define postlude as repeat (
    [substring] among (
        'U'  ( <- 'u' )
        'I'  ( <- 'i' )
        ''   ( next )
    )
)
backwardmode (
// Region tests (these are defined inside backwardmode). Each succeeds when
// the cursor is at or to the right of the corresponding mark.
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define RV as $pV <= cursor
// Step 0: handle a pronoun attached to the end of a verb form.
// The longest matching pronoun is bracketed as the slice; what happens to it
// depends on the verb ending found immediately before it.
define attached_pronoun as (
    [substring] among (
        // simple forms
        'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi'

        // compound forms, grouped by their first element
        'cela' 'cele' 'celi' 'celo' 'cene'
        'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
        'mela' 'mele' 'meli' 'melo' 'mene'
        'sene'
        'tela' 'tele' 'teli' 'telo' 'tene'
        'vela' 'vele' 'veli' 'velo' 'vene'
    )
    // The ending before the pronoun must satisfy the RV test.
    substring RV among (
        // after a gerund ending the pronoun is simply dropped
        'ando' 'endo'
            ( delete )
        // after a truncated infinitive the pronoun is replaced by e
        'ar' 'er' 'ir'
            ( <- 'e' )
    )
)
// Step 1: standard suffix removal.
// The longest matching suffix is bracketed as the slice and the action
// attached to its group is run. If the region test inside that action fails,
// the whole routine fails (stem then falls back to verb_suffix).
define standard_suffix as (
[substring] among(
'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
'atrice' 'atrici'
'ante' 'anti'
// delete if in R2
( R2 delete )
'azione' 'azioni' 'atore' 'atori'
// delete if in R2; then, if preceded by ic, delete that too if in R2
( R2 delete
try ( ['ic'] R2 delete )
)
'logia' 'logie'
// replace with log if in R2
( R2 <- 'log' )
'uzione' 'uzioni' 'usione' 'usioni'
// replace with u if in R2
( R2 <- 'u' )
'enza' 'enze'
// replace with ente if in R2
( R2 <- 'ente' )
'amento' 'amenti' 'imento' 'imenti'
// delete if in RV
( RV delete )
'amente' (
// delete if in R1
R1 delete
// then look for a preceding iv, os, ic or abil and delete it if in R2;
// after iv, a further preceding at is also deleted if in R2
try (
[substring] R2 delete among(
'iv' ( ['at'] R2 delete )
'os' 'ic' 'abil'
)
)
)
'it{a`}' (
// delete if in R2
R2 delete
// then, if preceded by abil, ic or iv, delete that if in R2
try (
[substring] among(
'abil' 'ic' 'iv' (R2 delete)
)
)
)
'ivo' 'ivi' 'iva' 'ive' (
// delete if in R2
R2 delete
// then, if preceded by at, delete it if in R2, and if that is in turn
// preceded by ic, delete it if in R2
try ( ['at'] R2 delete ['ic'] R2 delete )
)
)
)
// Step 2: verb suffixes.
// The search is limited to the part of the word from mark pV onwards; the
// longest ending found there is deleted.
define verb_suffix as setlimit tomark pV for (
    [substring] among (
        // endings in a-
        'ammo' 'ando' 'ano' 'ar' 'are' 'arono'
        'asse' 'assero' 'assi' 'assimo'
        'ata' 'ate' 'ati' 'ato'
        'ava' 'avamo' 'avano' 'avate' 'avi' 'avo'

        // endings in e- (a bare 'er' is left out: it is problematical)
        'emmo' 'enda' 'ende' 'endi' 'endo'
        'er{a`}' 'erai' 'eranno' 'ere' 'erebbe' 'erebbero' 'erei'
        'eremmo' 'eremo' 'ereste' 'eresti' 'erete' 'er{o`}' 'erono'
        'essero' 'ete'
        'eva' 'evamo' 'evano' 'evate' 'evi' 'evo'

        // endings in i-
        'iamo' 'immo' 'ir'
        'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
        'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
        'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero'
        'ita' 'ite' 'iti' 'ito'
        'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'

        // remaining endings
        // NOTE(review): nothing in this file produces an upper-case Y, so
        // 'Yamo' looks unreachable for lower-case input -- confirm before
        // removing it.
        'Yamo' 'ono' 'uta' 'ute' 'uti' 'uto'
            ( delete )
    )
)
// Character classes used only by vowel_suffix.
define CG   'cg'
define AEIO 'aeio{a`}{e`}{i`}{o`}'

// Steps 3a and 3b: tidy up the end of the word.
define vowel_suffix as (

    // 3a: delete a final vowel from AEIO if the RV test passes, then an i
    // directly before it, again only if the RV test passes.
    try (
        [AEIO] RV delete
        ['i']  RV delete
    )

    // 3b: delete a final h that follows c or g, provided the RV test passes
    // at the position before that c or g.
    try (
        ['h'] CG RV delete
    )
)
)
// Entry point: stem one word in place.
// Every stage is wrapped in `do`, so a stage that fails does not stop the
// stages after it from running.
define stem as (
// forward-mode preparation
do elisions
do prelude
do mark_regions
// suffix removal works backwards from the end of the word
backwards (
// step 0
do attached_pronoun
// step 1, falling back to step 2 only when step 1 fails
do (standard_suffix or verb_suffix)
// steps 3a and 3b
do vowel_suffix
)
// restore the I and U that prelude put into upper case
do postlude
)