Skip to content

Commit

Permalink
#8986 different behaviors based on type of meta
Browse files Browse the repository at this point in the history
  • Loading branch information
StephanMeijer committed Nov 6, 2023
1 parent 6ac7e0b commit d20c61b
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 34 deletions.
70 changes: 44 additions & 26 deletions src/Text/Pandoc/Readers/Docx.hs
Original file line number Diff line number Diff line change
Expand Up @@ -169,36 +169,54 @@ spansToKeep = []
-- | Paragraph style names in the "keep" list for divs.
-- NOTE(review): presumably paragraphs carrying one of these styles are
-- preserved as Div wrappers by the reader -- confirm at the use site.
divsToKeep :: [ParaStyleName]
divsToKeep =
  [ "Definition"
  , "Definition Term"
  ]

-- | Lookup table from a docx paragraph style name to the metadata
-- field name that paragraphs of that style populate.
metaStyles :: M.Map ParaStyleName T.Text
metaStyles =
  M.fromList
    [ ("Title", "title")
    , ("Subtitle", "subtitle")
    , ("Author", "author")
    , ("Date", "date")
    , ("Abstract", "abstract")
    ]

sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
sepBodyParts bps = (metaWithoutEmpty, nonMetaFirst ++ emptyPars ++ nonMetaLast)
where
(nonMetaFirst, rest) = break isMetaOrEmpty bps
(meta, nonMetaLast) = span isMetaOrEmpty rest
isMetaOrEmpty bp = isMetaPar bp || isEmptyPar bp
-- | Meta styles for which every matching paragraph is collected as
-- metadata (see 'isMultiMetaPar'), mapped to their metadata field name.
multiMetaStyles :: M.Map ParaStyleName T.Text
multiMetaStyles =
  M.fromList
    [ ("Author", "author")
    ]

(metaWithoutEmpty, emptyPars) = partition (not . isEmptyPar) meta
-- | Meta styles for which only the first matching paragraph is kept as
-- metadata, mapped to their metadata field name.
singleMetaStyles :: M.Map ParaStyleName T.Text
singleMetaStyles =
  M.fromList
    [ ("Title", "title")
    , ("Subtitle", "subtitle")
    , ("Date", "date")
    , ("Abstract", "abstract")
    ]

-- | True for a 'Paragraph' that carries at least one style name that is
-- a key of 'metaStyles'; False for every other body part.
isMetaPar :: BodyPart -> Bool
isMetaPar bodyPart = case bodyPart of
  Paragraph pPr _ ->
    any (`M.member` metaStyles) (getStyleNames (pStyle pPr))
  _ -> False
-- | All recognised meta styles: the single-instance and the
-- multi-instance tables combined (left-biased, single-instance first).
metaStyles :: M.Map ParaStyleName T.Text
metaStyles = M.unions [singleMetaStyles, multiMetaStyles]

isEmptyPar :: BodyPart -> Bool
isEmptyPar (Paragraph _ parParts) =
all isEmptyParPart parParts
sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
sepBodyParts bps = (multiMetas ++ singleMetas, restWithoutRelevantMeta)
where
isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
isEmptyParPart _ = False
isEmptyElem (TextRun s) = trim s == ""
isEmptyElem _ = True
isEmptyPar _ = False
-- Extract every paragraph whose style is listed in multiMetaStyles; all of them become metadata.
(multiMetas, restWithoutMulti) = partition isMultiMetaPar bps

-- For each style in singleMetaStyles, move only its first matching paragraph into singleMetas; all other body parts stay in the remainder.
(singleMetas, restWithoutRelevantMeta) = foldr extractSingle ([], restWithoutMulti) (M.keys singleMetaStyles)

-- Fold step: pull the first paragraph carrying the given style out of the
-- remaining body parts. If one is found it is prepended to the collected
-- metas; either way the second component is the body parts without it.
extractSingle :: ParaStyleName -> ([BodyPart], [BodyPart]) -> ([BodyPart], [BodyPart])
extractSingle styleName (collected, pending) =
  (maybe collected (: collected) firstMatch, others)
  where
    (firstMatch, others) =
      extractFirst (isSingleMetaPar styleName) pending

-- Convert an optional value into a list of zero or one elements.
maybeToList :: Maybe a -> [a]
maybeToList = maybe [] (: [])

-- True when the body part is a 'Paragraph' whose style names include the
-- given style name; any other body part yields False.
isSingleMetaPar :: ParaStyleName -> BodyPart -> Bool
isSingleMetaPar styleName bodyPart = case bodyPart of
  Paragraph pPr _ -> elem styleName (getStyleNames (pStyle pPr))
  _ -> False

-- Remove the first element satisfying the predicate from a list.
-- Returns that element (if any) together with the list without it;
-- the relative order of the remaining elements is unchanged.
extractFirst :: (a -> Bool) -> [a] -> (Maybe a, [a])
extractFirst predicate items = (firstHit, prefix ++ afterHit)
  where
    -- prefix: leading elements that fail the predicate
    (prefix, fromHit) = break predicate items
    (firstHit, afterHit) = case fromHit of
      [] -> (Nothing, [])
      (y : ys) -> (Just y, ys)


-- True for a 'Paragraph' that carries at least one style name that is a
-- key of 'multiMetaStyles'; False for every other body part.
isMultiMetaPar :: BodyPart -> Bool
isMultiMetaPar bodyPart = case bodyPart of
  Paragraph pPr _ ->
    any (`M.member` multiMetaStyles) (getStyleNames (pStyle pPr))
  _ -> False

bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map T.Text MetaValue)
bodyPartsToMeta' [] = return M.empty
Expand Down
2 changes: 1 addition & 1 deletion test/Tests/Readers/Docx.hs
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ tests = [ testGroup "document"
"docx/metadata.docx"
"docx/metadata.native"
, testCompareWithOpts def{readerStandalone=True}
"stop recording metadata with normal text"
"recording metadata after normal text only if author"
"docx/metadata_after_normal.docx"
"docx/metadata_after_normal.native"
]
Expand Down
169 changes: 162 additions & 7 deletions test/docx/metadata_after_normal.native
Original file line number Diff line number Diff line change
@@ -1,7 +1,162 @@
Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]})
[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]
,Para [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"]
,Para [Str "Mary",Space,Str "Ann",Space,Str "Evans"]
,Para [Str "Aurore",Space,Str "Dupin"]
,Para [Str "July",Space,Str "28,",Space,Str "2014"]
,Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]]
Pandoc
Meta
{ unMeta =
fromList
[ ( "abstract"
, MetaInlines
[ Str "This"
, Space
, Str "is"
, Space
, Str "a"
, Space
, Str "test"
, Space
, Str "of"
, Space
, Str "how"
, Space
, Str "this"
, Space
, Str "all"
, Space
, Str "works."
, Space
, Str "I\8217ve"
, Space
, Str "skipped"
, Space
, Str "lines"
, Space
, Str "here,"
, Space
, Str "which"
, Space
, Str "pandoc"
, Space
, Str "doesn\8217t"
, Space
, Str "do,"
, Space
, Str "but"
, Space
, Str "which"
, Space
, Str "shouldn\8217t"
, Space
, Str "make"
, Space
, Str "a"
, Space
, Str "difference."
]
)
, ( "author"
, MetaList
[ MetaInlines
[ Str "Mary"
, Space
, Str "Ann"
, Space
, Str "Evans"
]
, MetaInlines [ Str "Aurore" , Space , Str "Dupin" ]
, MetaInlines
[ Str "Mary"
, Space
, Str "Ann"
, Space
, Str "Evans"
]
, MetaInlines [ Str "Aurore" , Space , Str "Dupin" ]
]
)
, ( "date"
, MetaInlines
[ Str "July" , Space , Str "28," , Space , Str "2014" ]
)
, ( "title"
, MetaInlines
[ Str "This"
, Space
, Str "Is"
, Space
, Str "the"
, Space
, Str "Title"
]
)
]
}
[ Para
[ Str "And"
, Space
, Str "now"
, Space
, Str "this"
, Space
, Str "is"
, Space
, Str "normal"
, Space
, Str "text."
]
, Para
[ Str "This"
, Space
, Str "Is"
, Space
, Str "the"
, Space
, Str "Title"
]
, Para
[ Str "July" , Space , Str "28," , Space , Str "2014" ]
, Para
[ Str "This"
, Space
, Str "is"
, Space
, Str "a"
, Space
, Str "test"
, Space
, Str "of"
, Space
, Str "how"
, Space
, Str "this"
, Space
, Str "all"
, Space
, Str "works."
, Space
, Str "I\8217ve"
, Space
, Str "skipped"
, Space
, Str "lines"
, Space
, Str "here,"
, Space
, Str "which"
, Space
, Str "pandoc"
, Space
, Str "doesn\8217t"
, Space
, Str "do,"
, Space
, Str "but"
, Space
, Str "which"
, Space
, Str "shouldn\8217t"
, Space
, Str "make"
, Space
, Str "a"
, Space
, Str "difference."
]
]

0 comments on commit d20c61b

Please sign in to comment.