-
Notifications
You must be signed in to change notification settings - Fork 2
/
Sed.hs
90 lines (90 loc) · 4.57 KB
/
Sed.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
module Sed where
import Text.Regex.Posix
import Text.CSV
import Char
import qualified Data.Map as Dct
import Data.List
import Data.Ord
import qualified Lev
--- util ---
(&) = flip (.)
dctCollapse xs k v = Dct.fromAscListWith
(++)
(sortBy (comparing fst) [(k x, [v x]) | x <- xs])
listExtract ns xs = extract ns xs 0
where extract [] _ _ = []
extract (n:ns) xs i =
xs !! (n - i) : extract ns (drop (n - i) xs) n
kross (xs,ys) = do x <- xs; y <- ys; return (x,y)
both f (x,y) = (f x, f y)
pairs [] = []
pairs (x:xs) = [(x,y) | y <- xs] ++ pairs xs
average l = sum l / fromIntegral (length l)
--- read CSV ---
segment = head & dropWhile isLower & segmentName
where segmentName s = seg++n where (seg,n,_) =
s =~ "[0-9]" :: (String,String,String)
features (title:ns) = (feature title, map read ns)
where feature s = feat where (_,_,feat) =
s =~ "[0-9]" :: (String,String,String)
groupWords csv = Dct.map (fillsegments . phones) words
where words = dctCollapse (tail csv) (head & takeWhile isLower) id
fillsegments = Dct.mapWithKey makesegment
phones l = Dct.map Dct.fromList (dctCollapse l segment features)
makesegment typ d =
let size = length . head . Dct.elems $ d
in d `Dct.union` Dct.map (replicate size) (featdict Dct.! init typ)
groupRegions regions words = Dct.map outermost regions
where outermost range = Dct.map inner words
where inner = Dct.map (Dct.map (listExtract (map ((-) 2) range)))
groupSedInGor = do
csv <- parseCSVFromFile "sed.csv"
case csv of
Left err -> error ("oh no:" ++ show err)
Right rows -> return $ groupRegions regions $ groupWords $ transpose rows
--- analysis ---
flatten = map (map Dct.elems . Dct.elems) . Dct.elems
analyse sed = Dct.fromList . zip edges . map (sedDistance avgregions) $ regions
where edges = pairs (Dct.keys sed)
regions = pairs (flatten sed)
avgregions = average (map sedAvgTotal regions)
featureSub seg1 seg2 = fromIntegral (Dct.size(seg1 `symmetric_difference` seg2))
+ sum (map abs (Dct.elems (Dct.intersectionWith (-) seg1 seg2)))
where symmetric_difference d e = Dct.union (e `Dct.difference` d)
(d `Dct.difference` e)
sedDistance avg = sum . map (sedLevenshtein avg) . uncurry zip
transposeWord word = transpose (map transposeSegment word)
where transposeSegment seg = map (Dct.fromList . zip (Dct.keys seg))
(transpose (Dct.elems seg))
sedLevenshtein a = average . map (levenshtein a) . kross . both transposeWord
levenshtein a (w1,w2) =
head $ Lev._levenshtein w1 w2 a (const a, const a, featureSub)
sedAvg :: (Ord k, Fractional a) => ([Dct.Map k [a]], [Dct.Map k [a]]) -> a
sedAvg = both (concat . transposeWord) & kross & map (uncurry featureSub) & average
sedAvgTotal (region1,region2) = average (map sedAvg (zip region1 region2)) / 2.0
--- data ---
featdict = Dct.fromList [("C", Dct.fromList [("GL",0.0), ("V",0.0), ("H",0.0),
("PV",0.0), ("L",0.0)]),
("V", Dct.fromList [("B",1.0), ("H",1.0),
("L",1.0), ("R",1.0)]),
("R", Dct.fromList [("MN",1.5), ("PL",1.0)]),
("MULT", Dct.fromList [("MULT", 1.0)]),
("VC", Dct.empty)]
regions :: Dct.Map String [Int]
regions = Dct.fromList [("ne", [2..11]++[17..23]),
("nw", [11..17]++[23..41]++[77..83]),
("yk", [41..75]),
-- ++[75..77] (Isle of Man isn't on GOR map)
("wm", [112..126]++[140..157]),
("em", [83..112]++[126..140]++[157..172]),
("ee", [172..191]++[217..238]),
("se", [206..217]++[262..279]++[302..315]),
("sw", [193..206]++[240..262]++[279..302]),
("ld", [238..240])]
test = [["", "applV1H", "applV1L", "applC1GL", "applV2", "catcV1H", "askMULT0MULT", "askV1H", "askV1B"],
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"]]