forked from LeonieWeissweiler/CISTEM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Cistem.hs
105 lines (86 loc) · 3.67 KB
/
Cistem.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
{-# LANGUAGE OverloadedStrings, RecordWildCards #-}
module NLP.Stemmer.Cistem (stem,stemCaseInsensitive,Segmentation(..),segment',segment,segment'CaseInsensitive,segmentCaseInsensitive) where
import Data.Char
import Data.Monoid
import Data.Text as T
-- | Guess the word stem. This module uses the CISTEM algorithm, published by L. Weißweiler and A. Fraser in "Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers" (2017).
stem :: Text -> Text
stem t =
let firstUpper = isUpper (T.head t)
in postpare $ loop firstUpper $ prepare t
-- | A case insensitive variant. Use only if the text may be incorrectly upper case.
stemCaseInsensitive :: Text -> Text
stemCaseInsensitive t = postpare $ loop False $ prepare t
data Segmentation = Segmentation { segPrefix :: Text, segStem :: Text, segSuffix :: Text } deriving (Show,Eq)
-- | Split the word into a prefix, the stem and a suffix. In contrast to the `stem` function umlauts remain unchanged.
segment' :: Text -> Segmentation
segment' t =
let firstUpper = isUpper (T.head t)
lower = T.toLower t
prepared = segmentPrepare t
theStem = postpare $ loop firstUpper prepared
thePrefix | theStem `isPrefixOf` lower = ""
| "ge" `isPrefixOf` lower = "ge"
| otherwise = error ("segment' should be debugged; extracted stem: "++ unpack theStem)
theSuffix = T.drop (T.length thePrefix + T.length theStem) lower
in Segmentation thePrefix theStem theSuffix
-- | Split the word into stem and suffix. This is supposed to be compatible to the `segment` function from the reference implementation.
segment :: Text -> (Text,Text)
segment t =
let Segmentation{..} = segment' t
in (segPrefix<>segStem, segSuffix)
-- | A case insensitive variant. Use only if the text may be incorrectly upper case.
segmentCaseInsensitive :: Text -> (Text,Text)
segmentCaseInsensitive = segment . T.toLower
-- | A case insensitive variant. Use only if the text may be incorrectly upper case.
segment'CaseInsensitive :: Text -> Segmentation
segment'CaseInsensitive = segment' . T.toLower
loop u t | T.length t <= 3 = t
| (T.length t > 5) && (["em","er","nd"] `isSuffixOf'` t) = loop u (stripSuffix' ["em","er","nd"] t)
| not u && ("t" `isSuffixOf` t) = loop u (stripSuffix' ["t"] t)
| ["e","s","n"] `isSuffixOf'` t = loop u (stripSuffix' ["e","s","n"] t)
| otherwise = t
prepare :: Text -> Text
prepare =
replace "ü" "u" .
replace "ö" "o" .
replace "ä" "a" .
replxx .
replace "ß" "ss" .
segmentPrepare
segmentPrepare :: Text -> Text
segmentPrepare =
replace "sch" "$" .
replace "ie" "&" .
replace "ei" "%" .
replxx .
stripge .
T.toLower
postpare :: Text -> Text
postpare =
replace "%" "ei" .
replace "&" "ie" .
replace "$" "sch" .
replxxback
replxx :: Text -> Text
replxx = snd . mapAccumL f '\0'
where f prev curr | prev == curr = (curr,'*')
| otherwise = (curr,curr)
replxxback :: Text -> Text
replxxback = snd . mapAccumL f '\0'
where f prev '*' = (prev,prev)
f prev curr = (curr,curr)
stripge :: Text -> Text
stripge t | T.length t >= 6 =
case stripPrefix "ge" t of
Nothing -> t
Just t -> t
| otherwise = t
isSuffixOf' [] _ = False
isSuffixOf' (s:ss) t = (s `isSuffixOf` t) || (ss `isSuffixOf'` t)
stripSuffix' :: [Text] -> Text -> Text
stripSuffix' [] hay = hay
stripSuffix' (suff:ss) hay =
case stripSuffix suff hay of
Just t -> t
Nothing -> stripSuffix' ss hay