Turkish stemming algorithm

Links to resources

The Turkish stemming algorithm was provided by Evren Kapusuz Cilden. It stems only noun and nominal verb suffixes because noun stems are more important for information retrieval, and only handling these simplifies the algorithm significantly.

In her paper (linked above) Evren explains

The stemmer can be enhanced to stem all kinds of verb suffixes. In Turkish, there are over fifty suffixes that can be affixed to verbs [2]. The morphological structure of verb suffixes is more complicated than noun suffixes. Despite this, one can use the methodology presented in this paper to enhance the stemmer to find stems of all kinds of Turkish words.

where [2] is a reference to the following paper:

Gulsen Eryigit and Esref Adali. An Affix Stripping Morphological Analyzer for Turkish Proceedings of the IAESTED International Conference ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, Innsbruck, Austria.

The algorithm in Snowball

/* Stemmer for Turkish
	* author: Evren (Kapusuz) Çilden
	* email: evren.kapusuz at gmail.com
	* version: 1.0 (15.01.2007)


	* stems nominal verb suffixes
	* stems nominal inflections
	* more than one syllable word check
	* (y,n,s,U) context check
	* vowel harmony check
	* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)

	* The stemming algorithm is based on the paper "An Affix Stripping
	* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
	* Eşref Adalı (Proceedings of the IAESTED International Conference
	* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
	* Innsbruck, Austria

	* Turkish is an agglutinative language and has a very rich morphological
	* structure. In Turkish, you can form many different words from a single stem
	* by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
	* "You had been the doctor of him". The stem of the word is "doktor" and it
	* takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
	* the append order of suffixes can be clearly described as FSMs.
	* The paper referenced above defines some FSMs for right to left
	* morphological analysis. I generated a method for constructing snowball
	* expressions from right to left FSMs for stemming suffixes.
*/

routines (
	append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
	check_vowel_harmony	// tests vowel harmony for suffixes
	is_reserved_word	// tests whether current string is a reserved word ('ad','soyad')
	mark_cAsInA		// nominal verb suffix
	mark_DA			// noun suffix
	mark_DAn		// noun suffix
	mark_DUr		// nominal verb suffix
	mark_ki			// noun suffix
	mark_lAr		// noun suffix, nominal verb suffix
	mark_lArI		// noun suffix
	mark_nA			// noun suffix
	mark_ncA		// noun suffix
	mark_ndA		// noun suffix
	mark_ndAn		// noun suffix
	mark_nU			// noun suffix
	mark_nUn		// noun suffix
	mark_nUz		// nominal verb suffix
	mark_sU			// noun suffix
	mark_sUn		// nominal verb suffix
	mark_sUnUz		// nominal verb suffix
	mark_possessives	// -(U)m,-(U)n,-(U)mUz,-(U)nUz,
	mark_yA			// noun suffix
	mark_ylA		// noun suffix
	mark_yU			// noun suffix
	mark_yUm		// nominal verb suffix
	mark_yUz		// nominal verb suffix
	mark_yDU		// nominal verb suffix
	mark_yken		// nominal verb suffix
	mark_ymUs_		// nominal verb suffix
	mark_ysA		// nominal verb suffix

	mark_suffix_with_optional_y_consonant
	mark_suffix_with_optional_U_vowel
	mark_suffix_with_optional_n_consonant
	mark_suffix_with_optional_s_consonant

	more_than_one_syllable_word

	post_process_last_consonants
	postlude

	stem_nominal_verb_suffixes
	stem_noun_suffixes
	stem_suffix_chain_before_ki
)

stringescapes	{ }

/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef cc	'{U+00E7}'	// LATIN SMALL LETTER C WITH CEDILLA
stringdef g~	'{U+011F}'	// LATIN SMALL LETTER G WITH BREVE
stringdef i 	'{U+0131}'	// LATIN SMALL LETTER I WITHOUT DOT
stringdef o"	'{U+00F6}'	// LATIN SMALL LETTER O WITH DIAERESIS
stringdef sc	'{U+015F}'	// LATIN SMALL LETTER S WITH CEDILLA
stringdef u"	'{U+00FC}'	// LATIN SMALL LETTER U WITH DIAERESIS

booleans	( continue_stemming_noun_suffixes )

groupings	( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)

define vowel	'ae{i}io{o"}u{u"}'
define U	'{i}iu{u"}'

// the vowel grouping definitions below are used for checking vowel harmony
define vowel1	'a{i}ou'		// vowels that can end with suffixes containing 'a'
define vowel2	'ei{o"}{u"}'		// vowels that can end with suffixes containing 'e'
define vowel3	'a{i}'			// vowels that can end with suffixes containing 'i''
define vowel4	'ei'			// vowels that can end with suffixes containing 'i'
define vowel5	'ou'			// vowels that can end with suffixes containing 'o' or 'u'
define vowel6	'{o"}{u"}'		// vowels that can end with suffixes containing 'o"' or 'u"'

externals	( stem )

backwardmode (
	// checks vowel harmony for possible suffixes,
	// helps to detect whether the candidate for suffix applies to vowel harmony
	// this rule is added to prevent over stemming
	define check_vowel_harmony as (
		test
		(
			(goto vowel)   // if there is a vowel
			(
				('a' goto vowel1) or
				('e' goto vowel2) or
				('{i}' goto vowel3) or
				('i' goto vowel4) or
				('o' goto vowel5) or
				('{o"}' goto vowel6) or
				('u' goto vowel5) or
				('{u"}' goto vowel6)
			)
		)
	)

	// if the last consonant before suffix is vowel and n then advance and delete
	// if the last consonant before suffix is non vowel and n do nothing
	// if the last consonant before suffix is not n then only delete the suffix
	// assumption: slice beginning is set correctly
	define mark_suffix_with_optional_n_consonant as (
		('n' (test vowel))
		or
		((not(test 'n')) test(next vowel))

	)

	// if the last consonant before suffix is vowel and s then advance and delete
	// if the last consonant before suffix is non vowel and s do nothing
	// if the last consonant before suffix is not s then only delete the suffix
	// assumption: slice beginning is set correctly
	define mark_suffix_with_optional_s_consonant as (
		('s' (test vowel))
		or
		((not(test 's')) test(next vowel))
	)

	// if the last consonant before suffix is vowel and y then advance and delete
	// if the last consonant before suffix is non vowel and y do nothing
	// if the last consonant before suffix is not y then only delete the suffix
	// assumption: slice beginning is set correctly
	define mark_suffix_with_optional_y_consonant as (
		('y' (test vowel))
		or
		((not(test 'y')) test(next vowel))
	)

	define mark_suffix_with_optional_U_vowel as (
		(U (test non-vowel))
		or
		((not(test U)) test(next non-vowel))

	)

	define mark_possessives as (
		among ('m{i}z' 'miz' 'muz' 'm{u"}z'
		       'n{i}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
		(mark_suffix_with_optional_U_vowel)
	)

	define mark_sU as (
		check_vowel_harmony
		U
		(mark_suffix_with_optional_s_consonant)
	)

	define mark_lArI as (
		among ('leri' 'lar{i}')
	)

	define mark_yU as (
		check_vowel_harmony
		U
		(mark_suffix_with_optional_y_consonant)
	)

	define mark_nU as (
		check_vowel_harmony
		among ('n{i}' 'ni' 'nu' 'n{u"}')
	)

	define mark_nUn as (
		check_vowel_harmony
		among ('{i}n' 'in' 'un' '{u"}n')
		(mark_suffix_with_optional_n_consonant)
	)

	define mark_yA as (
		check_vowel_harmony
		among('a' 'e')
		(mark_suffix_with_optional_y_consonant)
	)

	define mark_nA as (
		check_vowel_harmony
		among('na' 'ne')
	)

	define mark_DA as (
		check_vowel_harmony
		among('da' 'de' 'ta' 'te')
	)

	define mark_ndA as (
		check_vowel_harmony
		among('nda' 'nde')
	)

	define mark_DAn as (
		check_vowel_harmony
		among('dan' 'den' 'tan' 'ten')
	)

	define mark_ndAn as (
		check_vowel_harmony
		among('ndan' 'nden')
	)

	define mark_ylA as (
		check_vowel_harmony
		among('la' 'le')
		(mark_suffix_with_optional_y_consonant)
	)

	define mark_ki as (
		'ki'
	)

	define mark_ncA as (
		check_vowel_harmony
		among('ca' 'ce')
		(mark_suffix_with_optional_n_consonant)
	)

	define mark_yUm as (
		check_vowel_harmony
		among ('{i}m' 'im' 'um' '{u"}m')
		(mark_suffix_with_optional_y_consonant)
	)

	define mark_sUn as (
		check_vowel_harmony
		among ('s{i}n' 'sin' 'sun' 's{u"}n' )
	)

	define mark_yUz as (
		check_vowel_harmony
		among ('{i}z' 'iz' 'uz' '{u"}z')
		(mark_suffix_with_optional_y_consonant)
	)

	define mark_sUnUz as (
		among ('s{i}n{i}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
	)

	define mark_lAr as (
		check_vowel_harmony
		among ('ler' 'lar')
	)

	define mark_nUz as (
		check_vowel_harmony
		among ('n{i}z' 'niz' 'nuz' 'n{u"}z')
	)

	define mark_DUr as (
		check_vowel_harmony
		among ('t{i}r' 'tir' 'tur' 't{u"}r' 'd{i}r' 'dir' 'dur' 'd{u"}r')
	)

	define mark_cAsInA as (
		among ('cas{i}na' 'cesine')
	)

	define mark_yDU as (
		check_vowel_harmony
		among ('t{i}m' 'tim' 'tum' 't{u"}m' 'd{i}m' 'dim' 'dum' 'd{u"}m'
			't{i}n' 'tin' 'tun' 't{u"}n' 'd{i}n' 'din' 'dun' 'd{u"}n'
			't{i}k' 'tik' 'tuk' 't{u"}k' 'd{i}k' 'dik' 'duk' 'd{u"}k'
			't{i}' 'ti' 'tu' 't{u"}' 'd{i}' 'di' 'du' 'd{u"}')
		(mark_suffix_with_optional_y_consonant)
	)

	// does not fully obey vowel harmony
	define mark_ysA as (
		among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
		(mark_suffix_with_optional_y_consonant)
	)

	define mark_ymUs_ as (
		check_vowel_harmony
		among ('m{i}{sc}' 'mi{sc}' 'mu{sc}' 'm{u"}{sc}')
		(mark_suffix_with_optional_y_consonant)
	)

	define mark_yken as (
		'ken' (mark_suffix_with_optional_y_consonant)
	)

	define stem_nominal_verb_suffixes as (
		[
			set continue_stemming_noun_suffixes
			(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
			or
			(mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
			or
			(
				mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
				unset continue_stemming_noun_suffixes
			)
			or
			(mark_nUz (mark_yDU or mark_ysA))
			or
			((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
			or
			(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
		]delete
	)

	// stems noun suffix chains ending with -ki
	define stem_suffix_chain_before_ki as (
		[
			mark_ki
			(
				(mark_DA] delete try([
					(mark_lAr] delete try(stem_suffix_chain_before_ki))
					or
					(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))

				))
				or
				(mark_nUn] delete try([
					(mark_lArI] delete)
					or
					([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
					or
					(stem_suffix_chain_before_ki)
				))
				or
				(mark_ndA (
					(mark_lArI] delete)
					or
					((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
					or
					(stem_suffix_chain_before_ki)
				))
			)
	)

	define stem_noun_suffixes as (
		([mark_lAr] delete try(stem_suffix_chain_before_ki))
		or
		([mark_ncA] delete
			try(
				([mark_lArI] delete)
				or
				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				([mark_lAr] delete stem_suffix_chain_before_ki)
			)
		)
		or
		([(mark_ndA or mark_nA)
			(
				(mark_lArI] delete)
				or
				(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				(stem_suffix_chain_before_ki)
			)
		)
		or
		([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
		or
		( [mark_DAn] delete try ([
			(
				(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				(mark_lAr] delete try(stem_suffix_chain_before_ki))
				or
				(stem_suffix_chain_before_ki)
			))
		)
		or
		([mark_nUn or mark_ylA] delete
			try(
				([mark_lAr] delete stem_suffix_chain_before_ki)
				or
				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				stem_suffix_chain_before_ki
			)
		)
		or
		([mark_lArI] delete)
		or
		(stem_suffix_chain_before_ki)
		or
		([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
		or
		([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
	)

	define post_process_last_consonants as (
		[substring] among (
			'b' (<- 'p')
			'c' (<- '{cc}')
			'd' (<- 't')
			'{g~}' (<- 'k')
		)
	)

	// after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed
	// like in 'kedim' -> 'ked'
	// Turkish words don't usually end with 'd' or 'g'
	// some very well known words are ignored (like 'ad' 'soyad'
	// appends U to stems ending with d or g, decides which vowel to add
	// based on the last vowel in the stem
	define append_U_to_stems_ending_with_d_or_g as (
		[] ('d' or 'g') goto vowel

		(('a' or '{i}') <- '{i}')
		or
		(('e' or 'i') <- 'i')
		or
		(('o' or 'u') <- 'u')
		or
		(('{o"}' or '{u"}') <- '{u"}')
	)

	define is_reserved_word as (
		'ad' try 'soy' atlimit
	)
)

// Tests if there are more than one syllables
// In Turkish each vowel indicates a distinct syllable
define more_than_one_syllable_word as (
	test (loop 2 gopast vowel)
)

define postlude as (
	backwards (
		not(is_reserved_word)
		do append_U_to_stems_ending_with_d_or_g
		do post_process_last_consonants

	)
)

define stem as (
	(more_than_one_syllable_word)
	(
		backwards (
			do stem_nominal_verb_suffixes
			continue_stemming_noun_suffixes
			do stem_noun_suffixes
		)

	postlude
	)
)