Here is a sample of Portuguese vocabulary, with the stemmed forms that will be generated by this algorithm:
Each block below lists the sample words first and then, after the ⇒ marker, the corresponding stems in the same order.
boa boainain boas bôas boassu boataria boate boates boatos bob boba bobagem bobagens bobalhões bobear bobeira bobinho bobinhos bobo bobs boca bocadas bocadinho bocado bocaiúva boçal bocarra bocas bode bodoque body boeing boem boemia boêmio boêmios bogotá boi bóia boiando |
⇒ |
boa boainain boas bôas boassu boat boat boat boat bob bob bobag bobagens bobalhõ bob bobeir bobinh bobinh bob bobs boc boc bocadinh boc bocaiúv boçal bocarr boc bod bodoqu body boeing boem boem boêmi boêmi bogot boi bói boi |
quiabo quicaram quickly quieto quietos quilate quilates quilinhos quilo quilombo quilométricas quilométricos quilômetro quilômetros quilos química químicas químico químicos quimioterapia quimioterápicos quimono quincas quinhão quinhentos quinn quino quinta quintal quintana quintanilha quintão quintessência quintino quinto quintos quintuplicou quinze quinzena quiosque |
⇒ |
quiab quic quickly quiet quiet quilat quilat quilinh quil quilomb quilométr quilométr quilômetr quilômetr quil químic químic químic químic quimioterap quimioteráp quimon quinc quinhã quinhent quinn quin quint quintal quintan quintanilh quintã quintessent quintin quint quint quintuplic quinz quinzen quiosqu |
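Letters in Portuguese include the following accented forms,
á é í ó ú â ê ô ç ã õ
The following letters are vowels:
a e i o u á é í ó ú â ê ô
And the two nasalised vowel forms,
ã õ
should be treated as a vowel followed by a consonant.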
ã and õ are therefore replaced by a~ and o~ in the word, where ~ is a separate character to be treated as a consonant. And then —
R2 (see the note on R1 and R2) and RV have the same definition as in the Spanish stemmer.
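Always do step 1.
Step 1: Standard suffix removal
Search for the longest among the following suffixes, and perform the action indicated.
- eza ezas ico ica icos icas ismo ismos ável ível ista istas oso osa osos osas amento amentos imento imentos adora ador aça~o adoras adores aço~es ante antes ância: delete if in R2
- logia logias: replace with log if in R2
- uça~o uço~es: replace with u if in R2
- ência ências: replace with ente if in R2
- amente: delete if in R1; then, if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2); otherwise, if preceded by os, ic or ad, delete if in R2
- mente: delete if in R2; then, if preceded by ante, avel or ível, delete if in R2
- idade idades: delete if in R2; then, if preceded by abil, ic or iv, delete if in R2
- iva ivo ivas ivos: delete if in R2; then, if preceded by at, delete if in R2
- ira iras: replace with ir if in RV and preceded by e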
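Do step 2 if no ending was removed by step 1.
Step 2: Verb suffixes
Search for the longest among the verb suffixes (the list given in the verb_suffix routine of the Snowball source below) in RV, and if found, delete.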
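If the last step to be obeyed — either step 1 or 2 — altered the word, do step 3
Step 3
Delete suffix i if in RV and preceded by c
Alternatively, if neither steps 1 nor 2 altered the word, do step 4
Step 4: Residual suffix
If the word ends with one of the suffixes
os a i o á í ó
in RV, delete it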
Always do step 5
Step 5:
If the word ends with one of
e é ê
in RV, delete it, and if preceded by gu (or ci) with the u (or i) in RV, delete the u (or i).
Or if the word ends ç remove the cedilla
And finally:
turn a~, o~ back into ã, õ
The same algorithm in Snowball:
// Declarations of the names used by this program.
// Routines defined in this file: the pre-/post-processing passes, the
// region marker, the three region tests and the four suffix steps.
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
verb_suffix
residual_suffix
residual_form
)
// The single routine exported to callers.
externals ( stem )
// Integer marks holding the start positions of RV, R1 and R2
// (assigned in mark_regions, read by the RV, R1 and R2 tests).
integers ( pV p1 p2 )
// Character grouping holding the vowels.
groupings ( v )
// Use { and } as the escape delimiters inside string literals, e.g. '{a'}'.
stringescapes {}
/* special characters */
// Each stringdef binds a short escape name, usable as {name} inside string
// literals, to the Unicode code point of an accented Portuguese letter.
stringdef a' '{U+00E1}' // a-acute
stringdef a^ '{U+00E2}' // a-circumflex, e.g. bota^nico
stringdef e' '{U+00E9}' // e-acute
stringdef e^ '{U+00EA}' // e-circumflex
stringdef i' '{U+00ED}' // i-acute
stringdef o^ '{U+00F4}' // o-circumflex
stringdef o' '{U+00F3}' // o-acute
stringdef u' '{U+00FA}' // u-acute
stringdef cc '{U+00E7}' // c-cedilla
// The two nasal vowels; prelude rewrites them as the two-character
// sequences 'a~' and 'o~', and postlude restores them.
stringdef a~ '{U+00E3}' // a-tilde
stringdef o~ '{U+00F5}' // o-tilde
// Vowels: the five plain vowels plus their acute forms and a/e/o circumflex.
// The nasal vowels a-tilde and o-tilde are not listed: prelude rewrites them
// as a plain vowel followed by '~', and '~' is not in this grouping, so it
// counts as a non-vowel when the regions are marked.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
// Pre-processing pass: walk the word from left to right and expand each
// nasal vowel into a plain vowel followed by a literal '~', so that the
// region marking sees "vowel + non-vowel". Any other character is simply
// stepped over. The loop ends when the end of the word is reached.
define prelude as repeat (
    ( ['{a~}'] <- 'a~' ) or
    ( ['{o~}'] <- 'o~' ) or
    next
)
// Compute the marks pV, p1 and p2 that the RV, R1 and R2 tests compare the
// cursor against. Called by stem before it switches to backward processing.
define mark_regions as (
// Default: every region is empty, i.e. starts at the end of the word.
$pV = limit
$p1 = limit
$p2 = limit // defaults
// RV. If no alternative below can be matched, pV keeps its default;
// `do` restores the cursor afterwards in either case.
do (
// First letter is a vowel: if the second letter is a non-vowel, RV starts
// after the next vowel; if the second letter is also a vowel, RV starts
// after the next non-vowel.
( v (non-v gopast v) or (v gopast non-v) )
or
// First letter is a non-vowel: if the second letter is also a non-vowel,
// RV starts after the next vowel; if the second letter is a vowel, RV
// starts after the third letter.
( non-v (non-v gopast v) or (v next) )
setmark pV
)
// R1 starts after the first non-vowel that follows a vowel. R2 is found by
// applying the same rule again from p1. If the second line cannot be
// matched, p1 stays set and p2 keeps its default.
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)
// Post-processing pass, the inverse of prelude: walk the word from left to
// right and collapse every 'a~' / 'o~' pair back into the single nasal
// vowel character. Any other character is stepped over; the loop ends at
// the end of the word.
define postlude as repeat (
    ( ['a~'] <- '{a~}' ) or
    ( ['o~'] <- '{o~}' ) or
    next
)
backwardmode (
// Region tests. Each one succeeds when the cursor is at or beyond the
// corresponding mark set by mark_regions; none of them moves the cursor.
define RV as (
    $pV <= cursor
)
define R1 as (
    $p1 <= cursor
)
define R2 as (
    $p2 <= cursor
)
// Step 1: standard suffix removal. Working backwards from the end of the
// word, find the longest suffix listed below and run the action attached to
// its group (strings without an action of their own share the next action
// that follows them). The routine fails when no suffix matches or when the
// region test that opens the action fails.
define standard_suffix as (
[substring] among(
// Plain derivational endings: delete if in R2.
'eza' 'ezas'
'ico' 'ica' 'icos' 'icas'
'ismo' 'ismos'
'{a'}vel'
'{i'}vel'
'ista' 'istas'
'oso' 'osa' 'osos' 'osas'
'amento' 'amentos'
'imento' 'imentos'
'adora' 'ador' 'a{cc}a~o'
'adoras' 'adores' 'a{cc}o~es' // no -ic test
'ante' 'antes' '{a^}ncia' // Note 1
(
R2 delete
)
// -logia(s): replace with 'log' if in R2.
'logia'
'logias'
(
R2 <- 'log'
)
// -ucao / -ucoes (written with c-cedilla and the 'a~' / 'o~' forms produced
// by prelude): replace with 'u' if in R2.
'u{cc}a~o' 'u{cc}o~es'
(
R2 <- 'u'
)
// -encia(s) with e-circumflex: replace with 'ente' if in R2.
'{e^}ncia' '{e^}ncias'
(
R2 <- 'ente'
)
// -amente: delete if in R1 (note: R1, not R2). Then optionally remove a
// preceding 'iv', 'os', 'ic' or 'ad' if that is in R2; after 'iv', a
// further preceding 'at' is also removed if in R2.
'amente'
(
R1 delete
try (
// The 'R2 delete' placed between substring and among runs for whichever
// string matched, before that string's own action (if any).
[substring] R2 delete among(
'iv' (['at'] R2 delete)
'os'
'ic'
'ad'
)
)
)
// -mente: delete if in R2, then optionally remove a preceding 'ante',
// 'avel' or 'ivel' (i-acute) if that is in R2.
'mente'
(
R2 delete
try (
[substring] among(
'ante' // Note 1
'avel'
'{i'}vel' (R2 delete)
)
)
)
// -idade(s): delete if in R2, then optionally remove a preceding 'abil',
// 'ic' or 'iv' if that is in R2.
'idade'
'idades'
(
R2 delete
try (
[substring] among(
'abil'
'ic'
'iv' (R2 delete)
)
)
)
// -iva / -ivo / -ivas / -ivos: delete if in R2, then optionally remove a
// preceding 'at' if that is in R2.
'iva' 'ivo'
'ivas' 'ivos'
(
R2 delete
try (
['at'] R2 delete // but not a further ['ic'] R2 delete
)
)
// -ira(s): only when the suffix is in RV and preceded by 'e' (i.e. -eira,
// -eiras); the matched 'ira'/'iras' is replaced with 'ir'.
'ira' 'iras'
(
RV 'e' // -eira -eiras usually non-verbal
<- 'ir'
)
)
)
// Step 2: verb suffixes. The search is confined to RV by moving the limit
// to the pV mark for the duration of the body; the longest ending from the
// list that fits entirely within that limit is deleted. The routine fails
// when none of the endings matches.
define verb_suffix as setlimit tomark pV for (
[substring] among(
// The order of the strings is not significant: substring always picks the
// longest match.
'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
// 'ira' / 'iras' also appear in standard_suffix, where they are only
// handled when preceded by 'e'; here they are deleted unconditionally.
'ira' 'iras'
(delete)
)
)
// Step 4: residual suffix. Strip a final 'os', or a final a / i / o
// (plain or acute-accented), provided the ending lies in RV.
define residual_suffix as (
    [substring] among (
        'a' '{a'}'
        'i' '{i'}'
        'o' '{o'}'
        'os'
        (
            RV
            delete
        )
    )
)
// Step 5: tidy up the final letter.
//  - A final e / e-acute / e-circumflex in RV is deleted. If what remains
//    ends in 'gu' (or 'ci') and the 'u' (or 'i') is in RV, that 'u' (or 'i')
//    is deleted as well.
//  - A final c-cedilla is replaced with a plain 'c'.
define residual_form as (
[substring] among(
'e' '{e'}' '{e^}'
// After the first delete, the bracket pair marks the 'u' (or 'i') as the
// slice, while `test` checks for the preceding 'g' (or 'c') without moving
// the cursor. If neither pattern matches, the action fails, but the first
// deletion has already been applied.
( RV delete [('u'] test 'g') or
('i'] test 'c') RV delete )
'{cc}' (<-'c')
)
)
)
// Entry point: stem the current word in place. Every stage is wrapped in
// `do`, so a stage that fails does not make stem itself fail.
define stem as (
// Rewrite the nasal vowels as vowel + '~' (undone by postlude at the end).
do prelude
// Set the pV, p1 and p2 marks.
do mark_regions
// The suffix steps work from the end of the word towards the start.
backwards (
do (
// Step 1, or step 2 if step 1 fails. If either succeeds, step 3 is
// attempted: delete a final 'i' that is in RV and preceded by 'c'.
( ( standard_suffix or verb_suffix )
and do ( ['i'] test 'c' RV delete )
)
// Step 4 runs only when neither step 1 nor step 2 succeeded.
or residual_suffix
)
// Step 5 is always attempted.
do residual_form
)
// Turn 'a~' / 'o~' back into the nasal vowel characters.
do postlude
)
/*
Note 1: additions of 15 Jun 2005
*/