.doc
file)
The Turkish stemming algorithm was provided by Evren Kapusuz Cilden. It stems only noun and nominal verb suffixes because noun stems are more important for information retrieval, and only handling these simplifies the algorithm significantly.
In her paper (linked above) Evren explains
The stemmer can be enhanced to stem all kinds of verb suffixes. In Turkish, there are over fifty suffixes that can be affixed to verbs [2]. The morphological structure of verb suffixes is more complicated than noun suffixes. Despite this, one can use the methodology presented in this paper to enhance the stemmer to find stems of all kinds of Turkish words.
where [2] is a reference to the following paper:
Gulsen Eryigit and Esref Adali. An Affix Stripping Morphological Analyzer for Turkish Proceedings of the IAESTED International Conference ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, Innsbruck, Austria.
In addition to the steps described in the paper, as a first step if the input contains an apostrophe we truncate it at the first apostrophe, which aims to remove proper noun suffixes. For example, Türkiye'dir ("it is Turkey") is truncate to Türkiye ("Turkey") which is then stemmed as it would be without the proper noun suffix. (This step was added in Snowball 2.3.0.)
/* Stemmer for Turkish
* author: Evren (Kapusuz) Çilden
* email: evren.kapusuz at gmail.com
* version: 1.0 (15.01.2007)
* stems nominal verb suffixes
* stems nominal inflections
* more than one syllable word check
* (y,n,s,U) context check
* vowel harmony check
* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
* The stemming algorithm is based on the paper "An Affix Stripping
* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
* Eşref Adalı (Proceedings of the IAESTED International Conference
* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
* Innsbruck, Austria
* Turkish is an agglutinative language and has a very rich morphological
* structure. In Turkish, you can form many different words from a single stem
* by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
* "You had been the doctor of him". The stem of the word is "doktor" and it
* takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
* the append order of suffixes can be clearly described as FSMs.
* The paper referenced above defines some FSMs for right to left
* morphological analysis. I generated a method for constructing snowball
* expressions from right to left FSMs for stemming suffixes.
*/
routines (
append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
check_vowel_harmony // tests vowel harmony for suffixes
is_reserved_word // tests whether current string is a reserved word ('ad','soyad')
mark_cAsInA // nominal verb suffix
mark_DA // noun suffix
mark_DAn // noun suffix
mark_DUr // nominal verb suffix
mark_ki // noun suffix
mark_lAr // noun suffix, nominal verb suffix
mark_lArI // noun suffix
mark_nA // noun suffix
mark_ncA // noun suffix
mark_ndA // noun suffix
mark_ndAn // noun suffix
mark_nU // noun suffix
mark_nUn // noun suffix
mark_nUz // nominal verb suffix
mark_sU // noun suffix
mark_sUn // nominal verb suffix
mark_sUnUz // nominal verb suffix
mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz,
mark_yA // noun suffix
mark_ylA // noun suffix
mark_yU // noun suffix
mark_yUm // nominal verb suffix
mark_yUz // nominal verb suffix
mark_yDU // nominal verb suffix
mark_yken // nominal verb suffix
mark_ymUs_ // nominal verb suffix
mark_ysA // nominal verb suffix
mark_suffix_with_optional_y_consonant
mark_suffix_with_optional_U_vowel
mark_suffix_with_optional_n_consonant
mark_suffix_with_optional_s_consonant
more_than_one_syllable_word
post_process_last_consonants
postlude
stem_nominal_verb_suffixes
stem_noun_suffixes
stem_suffix_chain_before_ki
)
stringescapes { }
/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef cc '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE
stringdef i '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT
stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS
stringdef sc '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA
stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS
booleans ( continue_stemming_noun_suffixes )
groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
define vowel 'ae{i}io{o"}u{u"}'
define U '{i}iu{u"}'
// the vowel grouping definitions below are used for checking vowel harmony
define vowel1 'a{i}ou' // vowels that can end with suffixes containing 'a'
define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
define vowel3 'a{i}' // vowels that can end with suffixes containing 'i''
define vowel4 'ei' // vowels that can end with suffixes containing 'i'
define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"'
externals ( stem )
backwardmode (
// checks vowel harmony for possible suffixes,
// helps to detect whether the candidate for suffix applies to vowel harmony
// this rule is added to prevent over stemming
define check_vowel_harmony as (
test
(
(goto vowel) // if there is a vowel
(
('a' goto vowel1) or
('e' goto vowel2) or
('{i}' goto vowel3) or
('i' goto vowel4) or
('o' goto vowel5) or
('{o"}' goto vowel6) or
('u' goto vowel5) or
('{u"}' goto vowel6)
)
)
)
// if the last consonant before suffix is vowel and n then advance and delete
// if the last consonant before suffix is non vowel and n do nothing
// if the last consonant before suffix is not n then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_n_consonant as (
('n' (test vowel))
or
((not(test 'n')) test(next vowel))
)
// if the last consonant before suffix is vowel and s then advance and delete
// if the last consonant before suffix is non vowel and s do nothing
// if the last consonant before suffix is not s then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_s_consonant as (
('s' (test vowel))
or
((not(test 's')) test(next vowel))
)
// if the last consonant before suffix is vowel and y then advance and delete
// if the last consonant before suffix is non vowel and y do nothing
// if the last consonant before suffix is not y then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_y_consonant as (
('y' (test vowel))
or
((not(test 'y')) test(next vowel))
)
define mark_suffix_with_optional_U_vowel as (
(U (test non-vowel))
or
((not(test U)) test(next non-vowel))
)
define mark_possessives as (
among ('m{i}z' 'miz' 'muz' 'm{u"}z'
'n{i}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
(mark_suffix_with_optional_U_vowel)
)
define mark_sU as (
check_vowel_harmony
U
(mark_suffix_with_optional_s_consonant)
)
define mark_lArI as (
among ('leri' 'lar{i}')
)
define mark_yU as (
check_vowel_harmony
U
(mark_suffix_with_optional_y_consonant)
)
define mark_nU as (
check_vowel_harmony
among ('n{i}' 'ni' 'nu' 'n{u"}')
)
define mark_nUn as (
check_vowel_harmony
among ('{i}n' 'in' 'un' '{u"}n')
(mark_suffix_with_optional_n_consonant)
)
define mark_yA as (
check_vowel_harmony
among('a' 'e')
(mark_suffix_with_optional_y_consonant)
)
define mark_nA as (
check_vowel_harmony
among('na' 'ne')
)
define mark_DA as (
check_vowel_harmony
among('da' 'de' 'ta' 'te')
)
define mark_ndA as (
check_vowel_harmony
among('nda' 'nde')
)
define mark_DAn as (
check_vowel_harmony
among('dan' 'den' 'tan' 'ten')
)
define mark_ndAn as (
check_vowel_harmony
among('ndan' 'nden')
)
define mark_ylA as (
check_vowel_harmony
among('la' 'le')
(mark_suffix_with_optional_y_consonant)
)
define mark_ki as (
'ki'
)
define mark_ncA as (
check_vowel_harmony
among('ca' 'ce')
(mark_suffix_with_optional_n_consonant)
)
define mark_yUm as (
check_vowel_harmony
among ('{i}m' 'im' 'um' '{u"}m')
(mark_suffix_with_optional_y_consonant)
)
define mark_sUn as (
check_vowel_harmony
among ('s{i}n' 'sin' 'sun' 's{u"}n' )
)
define mark_yUz as (
check_vowel_harmony
among ('{i}z' 'iz' 'uz' '{u"}z')
(mark_suffix_with_optional_y_consonant)
)
define mark_sUnUz as (
among ('s{i}n{i}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
)
define mark_lAr as (
check_vowel_harmony
among ('ler' 'lar')
)
define mark_nUz as (
check_vowel_harmony
among ('n{i}z' 'niz' 'nuz' 'n{u"}z')
)
define mark_DUr as (
check_vowel_harmony
among ('t{i}r' 'tir' 'tur' 't{u"}r' 'd{i}r' 'dir' 'dur' 'd{u"}r')
)
define mark_cAsInA as (
among ('cas{i}na' 'cesine')
)
define mark_yDU as (
check_vowel_harmony
among ('t{i}m' 'tim' 'tum' 't{u"}m' 'd{i}m' 'dim' 'dum' 'd{u"}m'
't{i}n' 'tin' 'tun' 't{u"}n' 'd{i}n' 'din' 'dun' 'd{u"}n'
't{i}k' 'tik' 'tuk' 't{u"}k' 'd{i}k' 'dik' 'duk' 'd{u"}k'
't{i}' 'ti' 'tu' 't{u"}' 'd{i}' 'di' 'du' 'd{u"}')
(mark_suffix_with_optional_y_consonant)
)
// does not fully obey vowel harmony
define mark_ysA as (
among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
(mark_suffix_with_optional_y_consonant)
)
define mark_ymUs_ as (
check_vowel_harmony
among ('m{i}{sc}' 'mi{sc}' 'mu{sc}' 'm{u"}{sc}')
(mark_suffix_with_optional_y_consonant)
)
define mark_yken as (
'ken' (mark_suffix_with_optional_y_consonant)
)
define stem_nominal_verb_suffixes as (
[
set continue_stemming_noun_suffixes
(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
or
(mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
or
(
mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
unset continue_stemming_noun_suffixes
)
or
(mark_nUz (mark_yDU or mark_ysA))
or
((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
or
(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
]delete
)
// stems noun suffix chains ending with -ki
define stem_suffix_chain_before_ki as (
[
mark_ki
(
(mark_DA] delete try([
(mark_lAr] delete try(stem_suffix_chain_before_ki))
or
(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
))
or
(mark_nUn] delete try([
(mark_lArI] delete)
or
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
(stem_suffix_chain_before_ki)
))
or
(mark_ndA (
(mark_lArI] delete)
or
((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
or
(stem_suffix_chain_before_ki)
))
)
)
define stem_noun_suffixes as (
([mark_lAr] delete try(stem_suffix_chain_before_ki))
or
([mark_ncA] delete
try(
([mark_lArI] delete)
or
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
([mark_lAr] delete stem_suffix_chain_before_ki)
)
)
or
([(mark_ndA or mark_nA)
(
(mark_lArI] delete)
or
(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
(stem_suffix_chain_before_ki)
)
)
or
([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
or
( [mark_DAn] delete try ([
(
(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
(mark_lAr] delete try(stem_suffix_chain_before_ki))
or
(stem_suffix_chain_before_ki)
))
)
or
([mark_nUn or mark_ylA] delete
try(
([mark_lAr] delete stem_suffix_chain_before_ki)
or
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
stem_suffix_chain_before_ki
)
)
or
([mark_lArI] delete)
or
(stem_suffix_chain_before_ki)
or
([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
or
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
)
define post_process_last_consonants as (
[substring] among (
'b' (<- 'p')
'c' (<- '{cc}')
'd' (<- 't')
'{g~}' (<- 'k')
)
)
// after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed
// like in 'kedim' -> 'ked'
// Turkish words don't usually end with 'd' or 'g'
// some very well known words are ignored (like 'ad' 'soyad'
// appends U to stems ending with d or g, decides which vowel to add
// based on the last vowel in the stem
define append_U_to_stems_ending_with_d_or_g as (
[] ('d' or 'g') goto vowel
(('a' or '{i}') <- '{i}')
or
(('e' or 'i') <- 'i')
or
(('o' or 'u') <- 'u')
or
(('{o"}' or '{u"}') <- '{u"}')
)
define is_reserved_word as (
'ad' try 'soy' atlimit
)
)
// Tests if there are more than one syllables
// In Turkish each vowel indicates a distinct syllable
define more_than_one_syllable_word as (
test (loop 2 gopast vowel)
)
define postlude as (
backwards (
not(is_reserved_word)
do append_U_to_stems_ending_with_d_or_g
do post_process_last_consonants
)
)
define stem as (
(more_than_one_syllable_word)
(
backwards (
do stem_nominal_verb_suffixes
continue_stemming_noun_suffixes
do stem_noun_suffixes
)
postlude
)
)