Try to preserve lexer rule ordering

lierdakil · Aug 4, 2023 · b0d8eae · b0d8eae
1 parent 71752b0
commit b0d8eae
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 39 deletions.
diff --git a/lib/Lexer/Build.hs b/lib/Lexer/Build.hs
@@ -9,8 +9,6 @@ import qualified Data.Set as S
 import Control.Monad.State
 import qualified Data.IntMap as IM
 import Data.List.NonEmpty (NonEmpty(..))
-import qualified Data.List.NonEmpty as NE
-import Data.Function (on)
 import Data.List
 import Data.Maybe
 import Control.Arrow
@@ -24,17 +22,23 @@ import qualified Data.Text as T
 import Utils
 import Lexer.Types
 import Data.Proxy
+import Debug.Trace
 
 makeLexer :: (LexerWriter lang, Monad m) => Proxy lang
           -> Bool -> [Text] -> MyMonadT m [(FilePath,Text)]
 makeLexer lang outputDebug input = do
   defs <- liftEither . left T.lines $ mapM (fmap regex . scanLine) input
+  traceM $ show defs
   let nfa = evalState (buildNFA defs) 0
       dfa = simplifyDFA . nfaToDFA $ nfa
       debug = if outputDebug
         then (("nfa.gv", nfaToGraphviz nfa) :) . (("dfa.gv", dfaToGraphviz dfa) :)
         else id
-      stList = map (second (second (sortCharPatterns . M.toList))) $ IM.toList dfa
+      stList :: [(IM.Key, (StateAttr, [(NonEmpty CharPattern, Int)]))]
+      stList = map (second (second (fmap (first snd) . M.toList))) $ IM.toList dfa
+  traceM $ show nfa
+  traceM $ show dfa
+  traceM $ show $ map (second (second M.toList)) $ IM.toList dfa
 
   accSt <- catMaybes <$> mapM (\(f, (s, _)) -> fmap (f,) <$> isSingle f s) stList
   let tokNames = nub $ mapMaybe (\(_, x) -> (, saType x) <$> saName x) accSt
@@ -53,41 +57,30 @@ makeLexer lang outputDebug input = do
     showSD StateData{saName=Nothing, saNum=num}
       = "<unnamed> (line " <> tshow num <> ")"
 
-sortCharPatterns :: [(NonEmpty CharPattern, Int)] -> [(NonEmpty CharPattern, Int)]
-sortCharPatterns = sortBy (cmp `on` fst)
-  where
-    cmp a b | bina && ainb = EQ
-            | bina = GT
-            | ainb = LT
-            | otherwise = compare (length a) (length b)
-      where ainb = contains' b a
-            bina = contains' a b
-    contains' a b = all (\bi -> any (`containsCR` bi) (NE.toList a)) (NE.toList b)
-
 newState :: State Int Int
 newState = state $ \s -> (s+1, s+1)
 
-nonAcc :: M.Map (Maybe (NonEmpty CharPattern)) [Int]
-       -> (StateAttr, M.Map (Maybe (NonEmpty CharPattern)) [Int])
+nonAcc :: M.Map (Maybe (Prio, NonEmpty CharPattern)) [Int]
+       -> (StateAttr, M.Map (Maybe (Prio, NonEmpty CharPattern)) [Int])
 nonAcc = (,) S.empty
 
-regex1ToNFASt :: RegexPatternSingle -> State Int NFA
-regex1ToNFASt (PGroup ch) = do
+regex1ToNFASt :: Prio -> RegexPatternSingle -> State Int NFA
+regex1ToNFASt pr (PGroup ch) = do
   startSt <- get
   endSt <- newState
-  return $ IM.singleton startSt (nonAcc $ M.singleton (Just ch) [endSt])
-regex1ToNFASt (PMaybe pat) = do
+  return $ IM.singleton startSt (nonAcc $ M.singleton (Just (pr, ch)) [endSt])
+regex1ToNFASt pr (PMaybe pat) = do
   s2 <- get
-  nfa <- regexToNFASt pat
+  nfa <- regexToNFASt pr pat
   s3 <- get
   return $ IM.insertWith mapUnion s2 (nonAcc $ M.singleton Nothing [s3]) nfa
-regex1ToNFASt (PKleene pat) = regex1ToNFASt (PMaybe [PPositive pat])
-regex1ToNFASt (PPositive pat) = do
+regex1ToNFASt pr (PKleene pat) = regex1ToNFASt pr (PMaybe [PPositive pat])
+regex1ToNFASt pr (PPositive pat) = do
   s2 <- get
-  nfa <- regexToNFASt pat
+  nfa <- regexToNFASt pr pat
   s3 <- get
   return $ IM.insertWith mapUnion s3 (nonAcc $ M.singleton Nothing [s2]) nfa
-regex1ToNFASt (PAlternative pats) = altNFA True $ map regexToNFASt pats
+regex1ToNFASt pr (PAlternative pats) = altNFA True $ map (regexToNFASt pr) pats
 
 altNFA :: Bool -> [State Int NFA] -> State Int NFA
 altNFA mkEndNode mnfas = do
@@ -113,14 +106,14 @@ mapUnion :: Ord k =>
             -> (StateAttr, M.Map k [a2])
 mapUnion (a, x) (b, y) = (a <> b, M.unionWith (++) x y)
 
-regexToNFASt :: RegexPattern -> State Int NFA
-regexToNFASt = foldr
-  (\p -> (IM.unionWith mapUnion <$> regex1ToNFASt p <*>))
+regexToNFASt :: Prio -> RegexPattern -> State Int NFA
+regexToNFASt pr = foldr
+  (\p -> (IM.unionWith mapUnion <$> regex1ToNFASt pr p <*>))
   (return IM.empty)
 
 regexToNFA :: (Int, Maybe Text, Action, Type, Greediness) -> RegexPattern -> State Int NFA
 regexToNFA (num, name, action, typ, greed) pat = do
-  res1 <- regexToNFASt pat
+  res1 <- regexToNFASt num pat
   lastSt <- get
   return $ IM.insertWith mapUnion lastSt (S.singleton (StateData num name action typ greed), M.empty) res1
 

diff --git a/lib/Lexer/FA.hs b/lib/Lexer/FA.hs
@@ -10,6 +10,7 @@ module Lexer.FA (
   , dfaToGraphviz
   , StateData(..)
   , containsCR
+  , Prio
   ) where
 
 import Regex.Parse (Action, Type, CharPattern(..), Greediness(..))
@@ -30,8 +31,9 @@ import Utils
 
 data StateData = StateData { saNum :: Int, saName :: Maybe Text, saAct :: Action, saType :: Type, saGreed :: Greediness } deriving (Show, Eq, Ord)
 type StateAttr = S.Set StateData
-type NFA = IM.IntMap (StateAttr, M.Map (Maybe (NonEmpty CharPattern)) [Int])
-type DFA = IM.IntMap (StateAttr, M.Map (NonEmpty CharPattern) Int)
+type Prio = Int
+type NFA = IM.IntMap (StateAttr, M.Map (Maybe (Prio, NonEmpty CharPattern)) [Int])
+type DFA = IM.IntMap (StateAttr, M.Map (Prio, NonEmpty CharPattern) Int)
 
 nfaToGraphviz :: NFA -> Text
 nfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
@@ -40,7 +42,7 @@ nfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
                                   <> foldMap (trans i) (M.toList t)
                            where lbl = mapMaybe saName $ S.toList a
                                  acc = if not $ null lbl then ", peripheries=2" else ""
-        trans i (c, ss) = foldMap (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern c<>"\"];") ss
+        trans i (c, ss) = foldMap (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern (fmap snd c) <>"\"];") ss
 
 dfaToGraphviz :: DFA -> Text
 dfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
@@ -49,7 +51,7 @@ dfaToGraphviz fa = "digraph{rankdir=LR;" <> foldMap node l <> "}"
                                   <> foldMap (trans i) (M.toList t)
                            where lbl = mapMaybe saName $ S.toList a
                                  acc = if not $ null lbl then ", peripheries=2" else ""
-        trans i (c, ss) = (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern (Just c)<>"\"];") ss
+        trans i ((_, c), ss) = (\s -> tshow i <> " -> " <> tshow s <> "[label=\""<> showCharPattern (Just c)<>"\"];") ss
 
 showCharPattern :: Maybe (NonEmpty CharPattern) -> Text
 showCharPattern Nothing = "ε"
@@ -61,6 +63,12 @@ showCharPattern (Just (x :| rest)) = foldMap show1 $ x : rest
         show1 (CNot xs) = "~(" <> showCharPattern (Just xs) <> ")"
         tshow' = T.replace "\\" "\\\\" . tshow
 
+newtype Min a = Min { getMin :: a }
+  deriving (Eq, Ord)
+
+instance Ord a => Semigroup (Min a) where
+  (<>) = min
+
 nfaToDFASt :: NFA -> State (Int, IS.IntSet, [(Int, (StateAttr, IS.IntSet))]) DFA
 nfaToDFASt nfa = do
   (st, seen, unseen) <- get
@@ -84,25 +92,30 @@ nfaToDFASt nfa = do
   u0 (a1, m1) (a2, m2) = (a1 <> a2, M.unionWith u1 m1 m2)
   u1 a b = if a == b then a else error "Multiple-state transition in DFA"
   moves = M.toList . enrich . M.unionsWith (<>) . map moves1 . IS.toList
+  moves1 :: IS.Key -> M.Map (Prio, NonEmpty CharPattern) IS.IntSet
   moves1 st
     | Just (_, trans) <- IM.lookup st nfa
     = M.map IS.fromList . M.mapKeysMonotonic fromJust . M.delete Nothing $ trans
     | otherwise = M.empty
   -- this computes transitive closure of xs over `contains` defined below
   -- using dfs; also removes duplication
+  enrich :: M.Map (Prio, NonEmpty CharPattern) IS.IntSet -> M.Map (Prio, NonEmpty CharPattern) IS.IntSet
   enrich xs =
-    let ks = S.toList . S.fromList $ concatMap NE.toList $ M.keys xs
-        xs' = M.fromListWith (<>) [(k, v) | (k1, v) <- M.toList xs, k <- NE.toList k1]
+    let ks :: [(Prio, CharPattern)]
+        ks = S.toList . S.fromList $ concatMap (\(p, cs) -> (p,) <$> NE.toList cs) $ M.keys xs
+        xs' :: M.Map CharPattern IS.IntSet
+        xs' = M.fromListWith (<>) [(k, v) | (k1, v) <- M.toList xs, k <- NE.toList $ snd k1]
         es = M.fromListWith (<>)
-              [ (k1, [k2]) | k1 <- ks, k2 <- ks, k1 /= k2, containsCR k2 k1 ]
-        getRow k = (S.singleton k, IS.unions $ map (xs' M.!) $ dfs es S.empty [k])
-        mergeKeys = swapMap . swapMap
+              [ (k1, [k2]) | k1 <- ks, k2 <- ks, k1 /= k2, containsCR (snd k2) (snd k1) ]
+        getRow :: (Prio, CharPattern) -> ((Prio, S.Set CharPattern), IS.IntSet)
+        getRow k@(p, cs) = ((p, S.singleton cs), IS.unions $ map ((xs' M.!) . snd) $ dfs es S.empty [k])
+        mergeKeys = M.mapKeysMonotonic (first getMin) . swapMap . swapMap . M.mapKeysMonotonic (first Min)
         swapMap :: (Ord k, Ord v, Semigroup k) => M.Map k v -> M.Map v k
         swapMap = M.fromListWith (<>) . map swap . M.toList
         setToNE s = case S.toList s of
           (x:xss) -> x NE.:| xss
           _ -> error "empty option"
-    in M.mapKeysMonotonic setToNE $ mergeKeys $ M.fromListWith (<>) $ map getRow ks
+    in M.mapKeysMonotonic (second setToNE) $ mergeKeys $ M.fromListWith (<>) $ map getRow ks
   dfs _ vis [] = S.toList vis
   dfs g vis (x:xs)
     | x `S.member` vis = dfs g vis xs