-
Notifications
You must be signed in to change notification settings - Fork 2
/
CbxTokenizer.py
45 lines (37 loc) · 1.25 KB
/
CbxTokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 26 10:43:12 2024
@author: cubAIx
"""
import re
class CbxToken:
    """A single token produced by CbxTokenizer, classified by kind."""

    # Token kind constants.
    UNK = 0    # unrecognized (e.g. XML/HTML entities like &amp;)
    WORD = 1   # run of word characters (\w+)
    PUNCT = 2  # exactly one non-word character
    TAG = 3    # XML/HTML tag such as <b> or </a>

    # Classification patterns, compiled once at class-definition time
    # instead of being re-matched from source strings for every token.
    _TAG_RE = re.compile(r'^<[^<>]*>$')
    _WORD_RE = re.compile(r'^\w+$')
    _PUNCT_RE = re.compile(r'^[^\w]$')

    def __init__(self, token, index):
        """Store the token text, its position index, and its classified kind.

        token: the matched text fragment.
        index: position of the token in the tokenized sequence.
        """
        # Readable if/elif chain replaces the original triple-nested ternary.
        if self._TAG_RE.match(token):
            self.kind = self.TAG
        elif self._WORD_RE.match(token):
            self.kind = self.WORD
        elif self._PUNCT_RE.match(token):
            self.kind = self.PUNCT
        else:
            self.kind = self.UNK
        self.token = token
        self.index = index

    def __repr__(self):
        # Fixed malformed format: original emitted "CbxToken(token=X)kind=Y"
        # with the closing paren before the kind field.
        return f"CbxToken(token={self.token}, kind={self.kind})"

    def __str__(self):
        return repr(self)
class CbxTokenizer:
    """Splits text containing XML/HTML markup into CbxToken objects."""

    def tokenize_xml(self, text):
        """Tokenize *text* into tags, words, entities and single characters.

        Returns a list of CbxToken, each carrying its position index.
        """
        # Temporary quick & dirty approach: a single regex pass that matches,
        # in order, tags, word runs, named/numeric entities, then any other char.
        pieces = re.findall(r'<[^<>]*>|\w+|&[a-zA-Z]+;|&#[0-9]+;|[^\w]', text)
        # Attach positional indexes, skipping any empty matches.
        return [CbxToken(piece, pos) for pos, piece in enumerate(pieces) if piece]

    def test(self):
        """Run the tokenizer on a sample sentence and print the results."""
        sample = "<a href='index.html'>Bonjour!</a> 😁 & Comment ça va C&A ? J'espère que <b>tout</b> va bien."
        result = self.tokenize_xml(sample)
        print(result)
        print("-----")
        for tok in result:
            print(f"[{tok.kind}][{tok.token}]")
        print("-----")
# CbxTokenizer().test();