forked from PaddlePaddle/Paddle3D
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcapet_vovnet_800x320_24ep_wocbgs_load_dd3d_pretrain.yml
191 lines (186 loc) · 5.3 KB
/
capet_vovnet_800x320_24ep_wocbgs_load_dd3d_pretrain.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Samples per training batch.
# NOTE(review): per-device vs. global depends on the launcher — confirm.
batch_size: 1
# Total training epochs. The lr_scheduler's T_max is annotated "3517 * 24",
# i.e. it is sized for this value; change both together.
epochs: 24
# Training split and its per-sample pipeline.
# NOTE(review): nesting was rebuilt from a copy that had lost its indentation;
# verify against the upstream file, in particular that `training` and
# `meta_keys` belong to the transform entries they are attached to here.
train_dataset:
  type: NuscenesMVDataset
  dataset_root: data/nuscenes/
  ann_file: data/nuscenes/petr_nuscenes_annotation_train.pkl
  mode: train
  use_valid_flag: True
  # Ten detection classes; model.pts_bbox_head.num_classes is 10 to match.
  class_names:
    - 'car'
    - 'truck'
    - 'construction_vehicle'
    - 'bus'
    - 'trailer'
    - 'barrier'
    - 'motorcycle'
    - 'bicycle'
    - 'pedestrian'
    - 'traffic_cone'
  # Pipeline entries, in declaration order.
  transforms:
    - type: LoadMultiViewImageFromFiles
      to_float32: True

    # One extra sweep per sample, drawn from sweep_range.
    - type: LoadMultiViewImageFromMultiSweepsFiles
      sweeps_num: 1
      to_float32: True
      pad_empty_sweeps: True
      sweep_range: [3, 27]
      test_mode: False

    - type: LoadAnnotations3D
      with_bbox_3d: True
      with_label_3d: True

    # Same range as model.pts_bbox_head.bbox_coder.point_cloud_range.
    - type: SampleRangeFilter
      point_cloud_range: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

    # Same ten names as class_names above.
    - type: SampleNameFilter
      classes:
        - 'car'
        - 'truck'
        - 'construction_vehicle'
        - 'bus'
        - 'trailer'
        - 'barrier'
        - 'motorcycle'
        - 'bicycle'
        - 'pedestrian'
        - 'traffic_cone'

    # final_dim is [320, 800], matching image_height/image_width in
    # model.pts_bbox_head.transformer.
    # NOTE(review): H/W presumably give the raw camera image size — confirm.
    - type: ResizeCropFlipImage
      sample_aug_cfg:
        resize_lim: [0.47, 0.625]
        final_dim: [320, 800]
        bot_pct_lim: [0.0, 0.0]
        rot_lim: [0.0, 0.0]
        H: 900
        W: 1600
        rand_flip: True
      training: True

    # NOTE(review): rot_range is presumably in radians — confirm.
    - type: GlobalRotScaleTransImage
      rot_range: [-0.3925, 0.3925]
      translation_std: [0, 0, 0]
      scale_ratio_range: [0.95, 1.05]
      reverse_angle: True
      training: True

    - type: NormalizeMultiviewImage
      mean: [103.530, 116.280, 123.675]
      std: [57.375, 57.12, 58.395]

    - type: PadMultiViewImage
      size_divisor: 32

    # Training keeps the ground-truth boxes and labels alongside the images.
    - type: SampleFilerByKey
      keys:
        - 'gt_bboxes_3d'
        - 'gt_labels_3d'
        - 'img'
      meta_keys:
        - 'filename'
        - 'ori_shape'
        - 'img_shape'
        - 'lidar2img'
        - 'intrinsics'
        - 'extrinsics'
        - 'pad_shape'
        - 'scale_factor'
        - 'flip'
        - 'box_type_3d'
        - 'img_norm_cfg'
        - 'sample_idx'
        - 'timestamp'
# Validation split. Compared with train_dataset, this pipeline has no
# annotation loading, no range/name filters and no GlobalRotScaleTransImage,
# and SampleFilerByKey keeps only 'img'.
# NOTE(review): nesting was rebuilt from a copy that had lost its indentation;
# verify against the upstream file, in particular that `training` and
# `meta_keys` belong to the transform entries they are attached to here.
val_dataset:
  type: NuscenesMVDataset
  dataset_root: data/nuscenes/
  ann_file: data/nuscenes/petr_nuscenes_annotation_val.pkl
  mode: val
  class_names:
    - 'car'
    - 'truck'
    - 'construction_vehicle'
    - 'bus'
    - 'trailer'
    - 'barrier'
    - 'motorcycle'
    - 'bicycle'
    - 'pedestrian'
    - 'traffic_cone'
  # Pipeline entries, in declaration order.
  transforms:
    - type: LoadMultiViewImageFromFiles
      to_float32: True

    # No test_mode key here, unlike the training entry, so the transform's
    # own default applies.
    - type: LoadMultiViewImageFromMultiSweepsFiles
      sweeps_num: 1
      to_float32: True
      pad_empty_sweeps: True
      sweep_range: [3, 27]

    # NOTE(review): rand_flip stays True while training is False; confirm the
    # transform skips random flipping when not training.
    - type: ResizeCropFlipImage
      sample_aug_cfg:
        resize_lim: [0.47, 0.625]
        final_dim: [320, 800]
        bot_pct_lim: [0.0, 0.0]
        rot_lim: [0.0, 0.0]
        H: 900
        W: 1600
        rand_flip: True
      training: False

    - type: NormalizeMultiviewImage
      mean: [103.530, 116.280, 123.675]
      std: [57.375, 57.12, 58.395]

    - type: PadMultiViewImage
      size_divisor: 32

    - type: SampleFilerByKey
      keys:
        - 'img'
      meta_keys:
        - 'filename'
        - 'ori_shape'
        - 'img_shape'
        - 'lidar2img'
        - 'intrinsics'
        - 'extrinsics'
        - 'pad_shape'
        - 'scale_factor'
        - 'flip'
        - 'box_type_3d'
        - 'img_norm_cfg'
        - 'sample_idx'
        - 'timestamp'
# AdamW with weight decay 0.01; gradients are clipped by global norm at 35.
optimizer:
  type: AdamW
  weight_decay: 0.01

  grad_clip:
    type: ClipGradByGlobalNorm
    clip_norm: 35
    # Disabled option, kept for reference:
    # auto_skip_clip: True
# LinearWarmup wrapping a CosineAnnealingDecay schedule: the rate goes from
# start_lr to end_lr over warmup_steps, then follows the inner schedule.
lr_scheduler:
  type: LinearWarmup

  # Inner schedule; its base rate equals end_lr below.
  learning_rate:
    type: CosineAnnealingDecay
    learning_rate: 0.0002
    T_max: 84408  # 3517 * 24
    eta_min: 0.0000002

  warmup_steps: 500
  # start_lr is about one third of end_lr (0.0002 / 3).
  start_lr: 0.00006666666
  end_lr: 0.0002
# CAPE detector: VoVNetCP backbone -> CPFPN neck -> CAPETemporalDNHead.
# NOTE(review): nesting was rebuilt from a copy that had lost its indentation;
# verify against the upstream file. The least certain placement is `scalar`
# and `noise_scale`, attached to `transformer` here because they follow the
# att_layer keys and precede `positional_encoding`.
model:
  type: CAPE
  use_recompute: True
  use_grid_mask: True

  img_backbone:
    # use recompute to save memory
    type: VoVNetCP
    spec_name: V-99-eSE
    norm_eval: True
    frozen_stages: -1
    input_ch: 3
    # NOTE(review): this is Python tuple syntax; a YAML loader reads it as the
    # single string "('stage4','stage5',)", not a two-item sequence. Left
    # unchanged because the consumer is not visible here — confirm it accepts
    # the string form, or switch to a YAML list.
    out_features: ('stage4','stage5',)

  img_neck:
    # remove unused parameters
    type: CPFPN
    # Two input levels and two outputs, one per backbone out_features stage.
    in_channels: [768, 1024]
    out_channels: 256
    num_outs: 2

  pts_bbox_head:
    type: CAPETemporalDNHead
    # Matches the ten entries in the datasets' class_names.
    num_classes: 10
    in_channels: 256
    num_query: 900
    # Same range as transformer.bound and bbox_coder.post_center_range.
    position_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
    code_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    normedlinear: False
    dn_weight: 1.0
    split: 0.75
    with_time: True
    with_prev_aux_loss: True
    prev_aux_loss_weight: 0.1

    transformer:
      type: CAPETransformer
      num_cameras: 6
      num_layers: 6
      feat_dim: 256
      feat_stride: 32
      # Same size as final_dim [320, 800] in the dataset pipelines.
      image_height: 320
      image_width: 800
      bound: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
      with_fpe: True
      depth_start: 1
      depth_num: 64
      att_layer:
        type: 'CrossViewAttention'
        # 8 heads * 64 dim_head = 512 = hidden_dim.
        hidden_dim: 512
        # Same count as pts_bbox_head.num_query.
        num_queries: 900
        qkv_bias: True
        heads: 8
        dim_head: 64
        conditional: True
      # Fixed: this was written `10,` (a leftover Python-style trailing comma),
      # which YAML loads as the string "10," instead of the integer 10.
      scalar: 10  # noise groups
      noise_scale: 1.0

    positional_encoding:
      type: SinePositionalEncoding3D
      num_feats: 128
      normalize: True

    bbox_coder:
      type: NMSFreeCoder
      post_center_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
      # Same range as SampleRangeFilter in train_dataset.
      point_cloud_range: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
      max_num: 300
      voxel_size: [0.2, 0.2, 8]
      num_classes: 10

    loss_cls:
      type: WeightedFocalLoss
      gamma: 2.0
      alpha: 0.25
      loss_weight: 2.0
      reduction: sum

    loss_bbox:
      type: WeightedL1Loss
      loss_weight: 0.25
      reduction: sum