Skip to content

Commit

Permalink
Try to preserve lexer rule ordering
Browse files Browse the repository at this point in the history
  • Loading branch information
lierdakil committed Aug 4, 2023
1 parent 71752b0 commit b0d8eae
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 39 deletions.
51 changes: 22 additions & 29 deletions lib/Lexer/Build.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ import qualified Data.Set as S
import Control.Monad.State
import qualified Data.IntMap as IM
import Data.List.NonEmpty (NonEmpty(..))
import qualified Data.List.NonEmpty as NE
import Data.Function (on)
import Data.List
import Data.Maybe
import Control.Arrow
Expand All @@ -24,17 +22,23 @@ import qualified Data.Text as T
import Utils
import Lexer.Types
import Data.Proxy
import Debug.Trace

makeLexer :: (LexerWriter lang, Monad m) => Proxy lang
-> Bool -> [Text] -> MyMonadT m [(FilePath,Text)]
makeLexer lang outputDebug input = do
defs <- liftEither . left T.lines $ mapM (fmap regex . scanLine) input
traceM $ show defs
let nfa = evalState (buildNFA defs) 0
dfa = simplifyDFA . nfaToDFA $ nfa
debug = if outputDebug
then (("nfa.gv", nfaToGraphviz nfa) :) . (("dfa.gv", dfaToGraphviz dfa) :)
else id
stList = map (second (second (sortCharPatterns . M.toList))) $ IM.toList dfa
stList :: [(IM.Key, (StateAttr, [(NonEmpty CharPattern, Int)]))]
stList = map (second (second (fmap (first snd) . M.toList))) $ IM.toList dfa
traceM $ show nfa
traceM $ show dfa
traceM $ show $ map (second (second M.toList)) $ IM.toList dfa

accSt <- catMaybes <$> mapM (\(f, (s, _)) -> fmap (f,) <$> isSingle f s) stList
let tokNames = nub $ mapMaybe (\(_, x) -> (, saType x) <$> saName x) accSt
Expand All @@ -53,41 +57,30 @@ makeLexer lang outputDebug input = do
showSD StateData{saName=Nothing, saNum=num}
= "<unnamed> (line " <> tshow num <> ")"

sortCharPatterns :: [(NonEmpty CharPattern, Int)] -> [(NonEmpty CharPattern, Int)]
sortCharPatterns = sortBy (cmp `on` fst)
where
cmp a b | bina && ainb = EQ
| bina = GT
| ainb = LT
| otherwise = compare (length a) (length b)
where ainb = contains' b a
bina = contains' a b
contains' a b = all (\bi -> any (`containsCR` bi) (NE.toList a)) (NE.toList b)

newState :: State Int Int
newState = state $ \s -> (s+1, s+1)

nonAcc :: M.Map (Maybe (NonEmpty CharPattern)) [Int]
-> (StateAttr, M.Map (Maybe (NonEmpty CharPattern)) [Int])
nonAcc :: M.Map (Maybe (Prio, NonEmpty CharPattern)) [Int]
-> (StateAttr, M.Map (Maybe (Prio, NonEmpty CharPattern)) [Int])
nonAcc = (,) S.empty

regex1ToNFASt :: RegexPatternSingle -> State Int NFA
regex1ToNFASt (PGroup ch) = do
regex1ToNFASt :: Prio -> RegexPatternSingle -> State Int NFA
regex1ToNFASt pr (PGroup ch) = do
startSt <- get
endSt <- newState
return $ IM.singleton startSt (nonAcc $ M.singleton (Just ch) [endSt])
regex1ToNFASt (PMaybe pat) = do
return $ IM.singleton startSt (nonAcc $ M.singleton (Just (pr, ch)) [endSt])
regex1ToNFASt pr (PMaybe pat) = do
s2 <- get
nfa <- regexToNFASt pat
nfa <- regexToNFASt pr pat
s3 <- get
return $ IM.insertWith mapUnion s2 (nonAcc $ M.singleton Nothing [s3]) nfa
regex1ToNFASt (PKleene pat) = regex1ToNFASt (PMaybe [PPositive pat])
regex1ToNFASt (PPositive pat) = do
regex1ToNFASt pr (PKleene pat) = regex1ToNFASt pr (PMaybe [PPositive pat])
regex1ToNFASt pr (PPositive pat) = do
s2 <- get
nfa <- regexToNFASt pat
nfa <- regexToNFASt pr pat
s3 <- get
return $ IM.insertWith mapUnion s3 (nonAcc $ M.singleton Nothing [s2]) nfa
regex1ToNFASt (PAlternative pats) = altNFA True $ map regexToNFASt pats
regex1ToNFASt pr (PAlternative pats) = altNFA True $ map (regexToNFASt pr) pats

altNFA :: Bool -> [State Int NFA] -> State Int NFA
altNFA mkEndNode mnfas = do
Expand All @@ -113,14 +106,14 @@ mapUnion :: Ord k =>
-> (StateAttr, M.Map k [a2])
mapUnion (a, x) (b, y) = (a <> b, M.unionWith (++) x y)

regexToNFASt :: RegexPattern -> State Int NFA
regexToNFASt = foldr
(\p -> (IM.unionWith mapUnion <$> regex1ToNFASt p <*>))
regexToNFASt :: Prio -> RegexPattern -> State Int NFA
regexToNFASt pr = foldr
(\p -> (IM.unionWith mapUnion <$> regex1ToNFASt pr p <*>))
(return IM.empty)

regexToNFA :: (Int, Maybe Text, Action, Type, Greediness) -> RegexPattern -> State Int NFA
regexToNFA (num, name, action, typ, greed) pat = do
res1 <- regexToNFASt pat
res1 <- regexToNFASt num pat
lastSt <- get
return $ IM.insertWith mapUnion lastSt (S.singleton (StateData num name action typ greed), M.empty) res1

Expand Down
33 changes: 23 additions & 10 deletions lib/Lexer/FA.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ module Lexer.FA (
, dfaToGraphviz
, StateData(..)
, containsCR
, Prio
) where

import Regex.Parse (Action, Type, CharPattern(..), Greediness(..))
Expand All @@ -30,8 +31,9 @@ import Utils

data StateData = StateData { saNum :: Int, saName :: Maybe Text, saAct :: Action, saType :: Type, saGreed :: Greediness } deriving (Show, Eq, Ord)
type StateAttr = S.Set StateData
type NFA = IM.IntMap (StateAttr, M.Map (Maybe (NonEmpty CharPattern)) [Int])
type DFA = IM.IntMap (StateAttr, M.Map (NonEmpty CharPattern) Int)
type Prio = Int
type NFA = IM.IntMap (StateAttr, M.Map (Maybe (Prio, NonEmpty CharPattern)) [Int])
type DFA = IM.IntMap (StateAttr, M.Map (Prio, NonEmpty CharPattern) Int)

nfaToGraphviz :: NFA -> Text
nfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
Expand All @@ -40,7 +42,7 @@ nfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
<> foldMap (trans i) (M.toList t)
where lbl = mapMaybe saName $ S.toList a
acc = if not $ null lbl then ", peripheries=2" else ""
trans i (c, ss) = foldMap (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern c<>"\"];") ss
trans i (c, ss) = foldMap (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern (fmap snd c) <>"\"];") ss

dfaToGraphviz :: DFA -> Text
dfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
Expand All @@ -49,7 +51,7 @@ dfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
<> foldMap (trans i) (M.toList t)
where lbl = mapMaybe saName $ S.toList a
acc = if not $ null lbl then ", peripheries=2" else ""
trans i (c, ss) = (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern (Just c)<>"\"];") ss
trans i ((_, c), ss) = (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern (Just c)<>"\"];") ss

showCharPattern :: Maybe (NonEmpty CharPattern) -> Text
showCharPattern Nothing = "ε"
Expand All @@ -61,6 +63,12 @@ showCharPattern (Just (x :| rest)) = foldMap show1 $ x : rest
show1 (CNot xs) = "~(" <> showCharPattern (Just xs) <> ")"
tshow' = T.replace "\\" "\\\\" . tshow

newtype Min a = Min { getMin :: a }
deriving (Eq, Ord)

instance Ord a => Semigroup (Min a) where
(<>) = min

nfaToDFASt :: NFA -> State (Int, IS.IntSet, [(Int, (StateAttr, IS.IntSet))]) DFA
nfaToDFASt nfa = do
(st, seen, unseen) <- get
Expand All @@ -84,25 +92,30 @@ nfaToDFASt nfa = do
u0 (a1, m1) (a2, m2) = (a1 <> a2, M.unionWith u1 m1 m2)
u1 a b = if a == b then a else error "Multiple-state transition in DFA"
moves = M.toList . enrich . M.unionsWith (<>) . map moves1 . IS.toList
moves1 :: IS.Key -> M.Map (Prio, NonEmpty CharPattern) IS.IntSet
moves1 st
| Just (_, trans) <- IM.lookup st nfa
= M.map IS.fromList . M.mapKeysMonotonic fromJust . M.delete Nothing $ trans
| otherwise = M.empty
-- this computes transitive closure of xs over `contains` defined below
-- using dfs; also removes duplication
enrich :: M.Map (Prio, NonEmpty CharPattern) IS.IntSet -> M.Map (Prio, NonEmpty CharPattern) IS.IntSet
enrich xs =
let ks = S.toList . S.fromList $ concatMap NE.toList $ M.keys xs
xs' = M.fromListWith (<>) [(k, v) | (k1, v) <- M.toList xs, k <- NE.toList k1]
let ks :: [(Prio, CharPattern)]
ks = S.toList . S.fromList $ concatMap (\(p, cs) -> (p,) <$> NE.toList cs) $ M.keys xs
xs' :: M.Map CharPattern IS.IntSet
xs' = M.fromListWith (<>) [(k, v) | (k1, v) <- M.toList xs, k <- NE.toList $ snd k1]
es = M.fromListWith (<>)
[ (k1, [k2]) | k1 <- ks, k2 <- ks, k1 /= k2, containsCR k2 k1 ]
getRow k = (S.singleton k, IS.unions $ map (xs' M.!) $ dfs es S.empty [k])
mergeKeys = swapMap . swapMap
[ (k1, [k2]) | k1 <- ks, k2 <- ks, k1 /= k2, containsCR (snd k2) (snd k1) ]
getRow :: (Prio, CharPattern) -> ((Prio, S.Set CharPattern), IS.IntSet)
getRow k@(p, cs) = ((p, S.singleton cs), IS.unions $ map ((xs' M.!) . snd) $ dfs es S.empty [k])
mergeKeys = M.mapKeysMonotonic (first getMin) . swapMap . swapMap . M.mapKeysMonotonic (first Min)
swapMap :: (Ord k, Ord v, Semigroup k) => M.Map k v -> M.Map v k
swapMap = M.fromListWith (<>) . map swap . M.toList
setToNE s = case S.toList s of
(x:xss) -> x NE.:| xss
_ -> error "empty option"
in M.mapKeysMonotonic setToNE $ mergeKeys $ M.fromListWith (<>) $ map getRow ks
in M.mapKeysMonotonic (second setToNE) $ mergeKeys $ M.fromListWith (<>) $ map getRow ks
dfs _ vis [] = S.toList vis
dfs g vis (x:xs)
| x `S.member` vis = dfs g vis xs
Expand Down

0 comments on commit b0d8eae

Please sign in to comment.