Support style settings

zakahan · Dec 22, 2024 · 9e0aecd · 9e0aecd
1 parent 98db2ed
commit 9e0aecd
Show file tree

Hide file tree

Showing 8 changed files with 142 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+/example/
+/tmp/
diff --git a/README.md b/README.md
@@ -9,12 +9,10 @@ English
 docx2md: convert docx to markdown.
 
 This project implements the conversion from DOCX to Markdown, achieved through XML matching.
-> Tips: The heading styles are determined solely by font size. 
-> In the current version, the principle is to find the `w:sz` tag settings in `Document.xml`. 
-> Some .docx documents that are relatively standardized may set font sizes via styles 
-> (the font size information is located in `Style.xml`), which cannot be recognized in the current version. 
-> This issue might be addressed in future updates.
 
+> Tips: The current version already supports recognition of heading styles, but it identifies the level based on the font size of the style rather than the "level" name. 
+> For a detailed correspondence between font sizes and Heading levels, please see the introduction below.
+> Therefore, even if a style is named Heading 1, it will not be processed as a first-level heading unless it matches the font size designated for Heading 1.
 
 
 ## Quick Start
@@ -69,6 +67,7 @@ and some images file.
 
 ## Word2Heading
 
+Note: The conversion relationship is as follows: Word's font size * 2 = the val of `w:sz` in the markup = pt value.
 
 According to practical conditions, most documents do not strictly follow the Table of Contents (TOC) or outline settings;
 therefore, we will not rely on the TOC for this process. Instead, it will completely depend on the font size.
@@ -106,3 +105,6 @@ or if there is a w:numPr tag (Word's own numbering tag), then the limit can be r
 and allow the appearance of h4 level headers.
 
 
+### Latest Update
+
+- [x] Support for recognizing titles set via style has been added. For titles set through Word styles, the system can now recognize these titles based on their style settings.
diff --git a/README_CN.md b/README_CN.md
@@ -10,10 +10,9 @@ docx2md: 将docx文件转为markdown
 
 本项目实现了将docx转化为Markdown
 
-> Tips，标题样式仅通过字体大小确定，
-> 且目前版本原理为查找`Document.xml`中的`w:sz`标记设置，
-> 有部分规整的.docx文档可能通过样式设置字体大小（即字体大小信息位于`Style.xml`中），
-> 在目前版本无法识别，可能会在后续版本进行更新。
+> Tips: 现有版本已经支持标题样式识别，但不会根据样式“级别”，而是根据样式的字体大小来识别级别。
+> 字体大小与Heading级别对应请见下方详细介绍。
+> 因此即使样式名称为Heading 1，除非其字体大小符合一级标题对应的字号，否则也不会将其处理为一级标题。
 
 ## 快速开始
 
@@ -66,6 +65,7 @@ func main() {
 
 ## Word2Heading
 
+注：换算关系 word的字号*2 = 标记中`w:sz`的val = pt值
 
 根据实际情况，大部分文档并未严格遵循目录（TOC）或大纲设置，因此本转换过程不依赖于TOC，而是完全依据段落的字体大小来确定标题级别。
 
@@ -95,3 +95,8 @@ func main() {
 ### 特殊情况处理
 
 此外，如果段落以编号或特定标记（如“一、二、三”或“第xxxx”或带有Word自身的编号标记`w:numPr`）开头，那么我们可以放宽对\<h3\>的长度限制至15个汉字（= 45字符），并且允许出现\<h4\>级别的标题。
+
+
+### 最新更新
+
+- [x] 支持了样式设置标题识别，对于通过word样式设置的标题，目前已经可以实现识别功能。
diff --git a/docx_parser/classes.go b/docx_parser/classes.go
@@ -16,8 +16,13 @@ type Body struct {
 }
 
+// Paragraph is the decoded form of a w:p element: its runs, an optional
+// numbering marker, and the paragraph style it references.
 type Paragraph struct { // paragraph type, w:p
-	Runs  []Run `xml:"r"`         // 段落包含多个 run
-	NumPr *bool `xml:"pPr>numPr"` // 检查是否存在编号信息
+	Runs    []Run  `xml:"r"`          // a paragraph contains multiple runs
+	NumPr   *bool  `xml:"pPr>numPr"`  // used to check whether numbering information is present
+	StyleId PStyle `xml:"pPr>pStyle"` // paragraph style reference (pPr>pStyle)
+}
+
+// PStyle is the decoded form of the pPr>pStyle element of a paragraph.
+type PStyle struct { // paragraph style
+	// Value is the element's val attribute; stylizedBody uses it as the
+	// key when looking up a Style by its StyleId.
+	Value string `xml:"val,attr"`
 }
 
 type Run struct { // 文本运行，可能包含文本或图片
@@ -70,3 +75,21 @@ type Relationship struct {
 	Type   string `xml:"Type,attr"`
 	Target string `xml:"Target,attr"`
 }
+
+/* -------------------------------------------------------------- */
+
+// Styles is the style sheet, i.e. the value ReadStyle decodes from
+// word/styles.xml.
+type Styles struct {
+	// XMLName has no tag; encoding/xml stores the name of the decoded
+	// root element here.
+	XMLName   xml.Name
+	// StyleList holds one entry per child "style" element.
+	StyleList []Style `xml:"style"`
+}
+
+// Style is a single "style" element of the style sheet.
+type Style struct {
+	// Name is decoded from the child "name" element.
+	Name     Name     `xml:"name"`
+	// StyleId is the styleId attribute; paragraphs refer to a style by
+	// this identifier (see Paragraph.StyleId).
+	StyleId  string   `xml:"styleId,attr"`
+	// FontSize is decoded from rPr>sz. NOTE(review): the README describes
+	// the sz value as Word's font size * 2 — confirm the unit against the
+	// FontSize type, which is declared elsewhere.
+	FontSize FontSize `xml:"rPr>sz"`
+}
+
+// Name is the decoded form of a style's "name" element.
+type Name struct {
+	// Value is the element's val attribute.
+	Value string `xml:"val,attr"`
+}
diff --git a/docx_parser/docx_reader.go b/docx_parser/docx_reader.go
@@ -51,6 +51,13 @@ func ReadDocx(filePath string, outputFileDir string) (*Document, error) {
 		return nil, err
 	}
 
+	/* ----------------------------------------------------------------------------- */
+	// 解析style文件
+	stylesList, err := ReadStyle(r, filePath)
+	if err != nil {
+		return nil, err
+	}
+	fmt.Println(stylesList)
 	/* ----------------------------------------------------------------------------- */
 
 	// 查找 document.xml
@@ -134,5 +141,7 @@ func ReadDocx(filePath string, outputFileDir string) (*Document, error) {
 		}
 	}
 
+	// 结合styles和body，修改font size
+	stylizedBody(&body, stylesList)
 	return &Document{Body: body}, nil
 }
diff --git a/docx_parser/style_classes.go b/docx_parser/style_classes.go
@@ -0,0 +1,7 @@
+// -------------------------------------------------
+// Package docx_parser
+// Author: hanzhi
+// Date: 2024/12/22
+// -------------------------------------------------
+
+package docx_parser
diff --git a/docx_parser/style_reader.go b/docx_parser/style_reader.go
@@ -0,0 +1,48 @@
+// -------------------------------------------------
+// Package docx_parser
+// Author: hanzhi
+// Date: 2024/12/22
+// -------------------------------------------------
+
+package docx_parser
+
+import (
+	"archive/zip"
+	"encoding/xml"
+	"fmt"
+	"io"
+)
+
+// ReadStyle locates the word/styles.xml entry inside the opened .docx
+// archive r and decodes it into a Styles value.
+//
+// filePath is used only to give error messages some context. An error is
+// returned when r is nil, when the archive has no word/styles.xml entry,
+// when the entry cannot be opened, or when its XML cannot be decoded. On
+// error the returned *Styles is nil.
+func ReadStyle(r *zip.ReadCloser, filePath string) (*Styles, error) {
+	if r == nil {
+		return nil, fmt.Errorf("reading styles of %s: nil zip reader", filePath)
+	}
+
+	// Find the style sheet entry in the archive.
+	var styleFile *zip.File
+	for _, f := range r.File {
+		if f.Name == "word/styles.xml" {
+			styleFile = f
+			break
+		}
+	}
+	if styleFile == nil {
+		return nil, fmt.Errorf("styles.xml not found in %s", filePath)
+	}
+
+	// Open the entry for reading.
+	rc, err := styleFile.Open()
+	if err != nil {
+		return nil, fmt.Errorf("opening word/styles.xml in %s: %w", filePath, err)
+	}
+	// The entry is only read, so a failing Close cannot lose data; the
+	// error is deliberately discarded.
+	defer func(c io.Closer) {
+		_ = c.Close()
+	}(rc)
+
+	// Decode the XML into the style sheet structure.
+	var stylesList Styles
+	if err := xml.NewDecoder(rc).Decode(&stylesList); err != nil {
+		return nil, fmt.Errorf("decoding word/styles.xml in %s: %w", filePath, err)
+	}
+
+	return &stylesList, nil
+}
diff --git a/docx_parser/stylized_body.go b/docx_parser/stylized_body.go
@@ -0,0 +1,35 @@
+// -------------------------------------------------
+// Package docx_parser
+// Author: hanzhi
+// Date: 2024/12/22
+// -------------------------------------------------
+
+package docx_parser
+
+func stylizedBody(body *Body, styles *Styles) {
+	// 根据styles生成一个map方便我查找
+	var styleFZMap map[string]int = make(map[string]int)
+	for _, style := range styles.StyleList {
+		if style.StyleId != "" {
+			styleFZMap[style.StyleId] = style.FontSize.Value
+		}
+	}
+	//fmt.Println(styleFZMap)
+	// 遍历body，寻找paragraph
+	for i, content := range body.Contents {
+		if content.Type == "paragraph" {
+			paragraph := content.Value.(Paragraph)
+			if paragraph.StyleId.Value != "" {
+				if fontSize, exists := styleFZMap[paragraph.StyleId.Value]; exists {
+					for j := range paragraph.Runs {
+						paragraph.Runs[j].FontSize.Value = fontSize
+					}
+				}
+			}
+			// 写回到 body.Contents[i]
+			body.Contents[i].Value = paragraph
+		}
+	}
+
+	//fmt.Println(styleFZMap)
+}