-
Notifications
You must be signed in to change notification settings - Fork 0
/
HTMLDiffPlus.py
244 lines (231 loc) · 11.6 KB
/
HTMLDiffPlus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#-*- coding: UTF-8 -*-
'''
这是根据C#的HTMLDiff库所写的Python版本,而且改良了可以识别汉字、标签、英文单词、数字和左右括号。
步骤1:把旧字符串分词到列表中,再把新字符串分词到列表中,然后把新字符串列表根据词进行索引。
步骤2:把位置索引存到列表中,因此格式相当于一个字典,字符串当做key,位置列表当做value。{string:[1,2,3]}
步骤3:将旧字符串列表的每个词在字典中查询,目的是取得最大长度的匹配。
步骤4:然后通过递归或循环,在最大匹配的左侧和右侧分别找到匹配,直到匹配找完。
步骤5:将新旧字符串列表和匹配进行比较。找到变动处。生成待加工的section列表
步骤6:将待加工的section依次进行加工。
步骤7:最后把加工后的section列表输出
'''
import re
from builtins import print
def isTag(string):
regex = re.compile(r"<[^>]+>")
return True if regex.match(string) else False
def isOpeningTag(string):
regex = re.compile(r"<[^>/]+>")
return True if regex.match(string) else False
class HTMLDiff:
def __init__(self,oldText,newText):
self.oldWords = self.__SplitWords(oldText)
self.newWords = self.__SplitWords(newText)
self.newWordsIndex = {}
self.content = []
self.tagWhiteList = re.compile(r"</?(?:table|tr|td|th|thead|tbody).*>")
def __SplitWords(self,text):
regex = re.compile(r"<[^>]+>|[\u4e00-\u9fa5]|[a-zA-Z\(\)]+|\d+|\s+")
return regex.findall(text)
def IndexNewWords(self):
for i in range(0,self.newWords.__len__()):
if self.newWords[i] not in self.newWordsIndex:
self.newWordsIndex[self.newWords[i]] = []
self.newWordsIndex[self.newWords[i]].append(i)
def __findMaxMatch(self,startInOld, endInOld , startInNew , endInNew):
'''
找到最大匹配
算法如下:
每个旧词查找在新词字典的位置
依次记录旧词在新词的每个位置及已经匹配的长度,记录最大的长度匹配的长度及在旧词列表的位置和在新词列表的位置,并把结果返回
'''
#记录每次循环前一次的单词位置(key)和截止上一次的匹配长度。{int:int}
maxMatch = {"startInOld":0, "startInNew":0, "size":0}
preRecord = {}
for oldindex in range(startInOld, endInOld):
record = {}
if self.oldWords[oldindex] in self.newWordsIndex:
for i in self.newWordsIndex[self.oldWords[oldindex]]:
if i < startInNew:
continue
if i >= endInNew:
break
record[i] = preRecord[i-1]+1 if (i-1) in preRecord else 1
if record[i] > maxMatch["size"]:
maxMatch["startInOld"] = oldindex - record[i] + 1
maxMatch["startInNew"] = i - record[i] + 1
maxMatch["size"] = record[i]
preRecord = record
return maxMatch if maxMatch["size"]>0 else None
def findAllMatches(self,startInOld, endInOld , startInNew , endInNew):
'''
用栈+循环或者递归来取出所有匹配,并将所有匹配返回
本例用栈+循环,这种方式需要在后面进行排序
'''
stack = []
matches = []
tmpBlock = {}
block = {"startInOld": startInOld, "endInOld": endInOld, "startInNew": startInNew, "endInNew": endInNew}
stack.append(block)
while stack.__len__() > 0:
block = stack.pop()
match = self.__findMaxMatch(block["startInOld"], block["endInOld"], block["startInNew"], block["endInNew"])
if match is not None:
matches.append(match)
matchOldEnd = match["startInOld"]+match["size"]
matchNewEnd = match["startInNew"]+match["size"]
if block["endInOld"] > matchOldEnd and block["endInNew"] > matchNewEnd:
tmpBlock["startInOld"] = matchOldEnd
tmpBlock["endInOld"] = block["endInOld"]
tmpBlock["startInNew"] = matchNewEnd
tmpBlock["endInNew"] = block["endInNew"]
stack.append(dict(tmpBlock))
if block["startInOld"] < match["startInOld"] and block["startInNew"] < match["startInNew"]:
tmpBlock["startInOld"] = block["startInOld"]
tmpBlock["endInOld"] = match["startInOld"]
tmpBlock["startInNew"] = block["startInNew"]
tmpBlock["endInNew"] = match["startInNew"]
stack.append(dict(tmpBlock))
return matches
def diff(self):
'''
进行字符串diff的方法
'''
sections = [] #操作块
posInOld = 0
posInNew = 0
self.IndexNewWords()
matches = self.findAllMatches(0, self.oldWords.__len__(), 0, self.newWords.__len__())
sortedmatches = sorted(matches, key=lambda d: d["startInOld"])
#添加一个收尾标记,使最后一个match之后的文本也参与运算
sortedmatches.append({"startInOld": self.oldWords.__len__(), "startInNew": self.newWords.__len__(), "size": 0})
for match in sortedmatches:
matchOldEnd = match["startInOld"]+match["size"]
matchNewEnd = match["startInNew"]+match["size"]
if posInOld == match["startInOld"]:
if posInNew == match["startInNew"]:
pass
else:
sections.append(("diffInsert", match["startInOld"], matchOldEnd, posInNew, match["startInNew"]))
else:
if posInNew == match["startInNew"]:
sections.append(("diffDelete", posInOld, match["startInOld"], match["startInNew"], matchNewEnd))
else:
sections.append(("diffReplace", posInOld, match["startInOld"], posInNew, match["startInNew"]))
sections.append(("none", match["startInOld"], matchOldEnd, match["startInNew"], matchNewEnd))
posInOld = match["startInOld"]+match["size"]
posInNew = match["startInNew"]+match["size"]
#将sections中的数据依次进行处理
for section in sections:
if section[0] == "none":
self.content.append("".join(self.newWords[section[3]:section[4]]))
elif section[0] == "diffInsert":
self.__wrapTag(section[0],self.newWords,section[3],section[4])
elif section[0] == "diffDelete":
self.__wrapTag(section[0],self.oldWords,section[1],section[2])
elif section[0] == "diffReplace":
self.__wrapTag("diffDelete",self.oldWords,section[1],section[2])
self.__wrapTag("diffInsert",self.newWords,section[3],section[4])
else:
print("error")
content = "".join(self.content)
return content
def killTagOutOfWhiteList(self,tags):
'''
过滤标签白名单(self.tagWhiteList)外的标签
'''
for i in range(0,tags.__len__()):
if self.tagWhiteList.match(tags[i]) is None:
tags[i]=""
def __wrapTag(self, cssClass, words, startpos, endpos):
'''
包裹标签,如果在self.__splitTagAndWords返回的结果中是文字则在文字两边包裹标签
如果是标签列表,并且第一个标签是开始标签,则列表后面加一个<span>
如果是标签列表,并且第一个标签是结束标签,则列表之前加</span>
如果是标签列表,并且cssClass是diffDelete,说明是该标签在旧字符串上,而不再新字符串上,这是就把标签进行过滤
最后把加工好的内容合并为字符串,放到self.content里待输出
'''
for isTagMode, words in self.__splitTagAndWords(words,startpos,endpos):
if isTagMode :
if isOpeningTag(words[0]):
if cssClass == "diffDelete":
self.killTagOutOfWhiteList(words)
words.append(r'<span class="{0}">'.format("diffTag"))
self.content.append("".join(words))
else:
if cssClass == "diffDelete":
self.killTagOutOfWhiteList(words)
self.content.append(r'</span>'+("".join(words)))
else:
words.append(r'</span>')
self.content.append(r'<span class="{0}">'.format(cssClass)+("".join(words)))
def __splitTagAndWords(self, words, startpos, endpos):
'''
将待处理的区块中连续非标签内容和连续的标签内容分别输出出来
'''
isTagMode = isTag(words[startpos])
tmp = startpos
for pos in range(startpos, endpos):
if isTag(words[pos]) == isTagMode:
pass
else:
yield (isTagMode,words[tmp:pos])
isTagMode = not isTagMode
tmp = pos
if tmp < endpos:
yield (isTagMode,words[tmp:endpos])
if __name__ == "__main__":
oldold = r'''<p><i>This is</i> some sample text to <strong>demonstrate</strong> the capability of the <strong>HTML diff tool</strong>.</p>
<p>It is based on the <b>Ruby</b> implementation found <a href='http://github.com/myobie/htmldiff'>here</a>. Note how the link has no tooltip</p>
<table cellpadding='0' cellspacing='0'>
<tr><td>Some sample text</td><td>Some sample value</td></tr>
<tr><td>Data 1 (this row will be removed)</td><td>Data 2</td></tr>
</table>
Here is a number 2 32'''
newnew = r'''<p>This is some sample <strong>text to</strong> demonstrate the awesome capabilities of the <strong>HTML <u>diff</u> tool</strong>.</p><br/><br/>Extra spacing here that was not here before.
<p>It is <i>based</i> on the Ruby implementation found <a title='Cool tooltip' href='http://github.com/myobie/htmldiff'>here</a>. Note how the link has a tooltip now and the HTML diff algorithm has preserved formatting.</p>
<table cellpadding='0' cellspacing='0'>
<tr><td>Some sample <strong>bold text</strong></td><td>Some sample value</td></tr>
</table>
Here is a number 2 <sup>32</sup>'''
hdiff = HTMLDiff(oldold,newnew)
html = open("diffplus.html","wb+")
body = '''
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style>
table {{
border:1px solid #d9d9d9;
}}
td {{
border:1px solid #d9d9d9;
padding:3px;
}}
.diffTag{{
background: yellow
}}
.diffInsert{{
background: green;
color : white;
}}
.diffDelete{{
text-decoration: line-through;
color : grey;
}}
</style>
</head>
<body>
<div>{0}</div>
<hr/>
<div>{1}</div>
<hr/>
<div>{2}</div>
<body>
</html>
'''
mydiff = body.format(oldold,newnew,hdiff.diff())
html.write(mydiff.encode("utf-8"))
html.close()