forked from WenmuZhou/TableGeneration
-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_data.py
119 lines (111 loc) · 3.7 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Description: generate_data
"""
import argparse
import sys
from TableGeneration.GenerateTable import GenerateTable
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--num", type=int, default=1, help="the number of generate table"
)
# output path
parser.add_argument(
"--output", type=str, default="output/simple_table"
) # data save path
# courp path
parser.add_argument("--ch_dict_path", type=str, default="dict/ch_news.txt")
parser.add_argument("--en_dict_path", type=str, default="dict/en_corpus.txt")
# table settings
# cell box type
parser.add_argument(
"--cell_box_type",
type=str,
default="cell",
help="cell: use cell location as cell box; text: use location of text in cell as cell box",
)
# row and col
parser.add_argument("--min_row", type=int, default=3, help="min rows in table")
parser.add_argument("--max_row", type=int, default=15, help="max rows in table")
parser.add_argument("--min_col", type=int, default=3, help="min cols in table")
parser.add_argument("--max_col", type=int, default=10, help="max cols in table")
# row and col span
parser.add_argument(
"--max_span_row_count", type=int, default=3, help="max span rows"
)
parser.add_argument(
"--max_span_col_count", type=int, default=3, help="max span cols"
)
parser.add_argument(
"--max_span_value",
type=int,
default=10,
help="max value in rowspan and colspan",
)
# txts lens
parser.add_argument(
"--min_txt_len", type=int, default=2, help="min number of char in cell"
)
parser.add_argument(
"--max_txt_len", type=int, default=10, help="max number of char in cell"
)
# color
parser.add_argument(
"--color_prob", type=float, default=0, help="the prob of color cell"
)
# cell size
parser.add_argument(
"--cell_max_width", type=int, default=0, help="max width of cell"
)
parser.add_argument(
"--cell_max_height", type=int, default=0, help="max height of cell"
)
# windows size
parser.add_argument(
"--brower_width", type=int, default=2440, help="width of brower"
)
parser.add_argument(
"--brower_height", type=int, default=2440, help="height of brower"
)
parser.add_argument(
"--brower", type=str, default="chrome", help="chrome or firefox"
)
# backgroud
parser.add_argument(
"--backgroud",
type=str,
default="gaussian_noise",
help="Create a background with Gaussian noise (to mimic paper)",
)
args = parser.parse_args()
if args.brower == "chrome" and sys.platform == "darwin":
print("firefox is recommend for Mac OS, bug you choice is chrome")
sys.exit(0)
return args
if __name__ == "__main__":
args = parse_args()
print(args)
t = GenerateTable(
output=args.output,
ch_dict_path=args.ch_dict_path,
en_dict_path=args.en_dict_path,
cell_box_type=args.cell_box_type,
min_row=args.min_row,
max_row=args.max_row,
min_col=args.min_col,
max_col=args.max_col,
min_txt_len=args.min_txt_len,
max_txt_len=args.max_txt_len,
max_span_row_count=args.max_span_row_count,
max_span_col_count=args.max_span_col_count,
max_span_value=args.max_span_value,
color_prob=args.color_prob,
cell_max_width=args.cell_max_width,
cell_max_height=args.cell_max_height,
brower=args.brower,
brower_width=args.brower_width,
brower_height=args.brower_height,
backgroud = args.backgroud
)
t.gen_table_img(args.num)
t.close()